From c7d1c0c6ead2e9a5917031158ac7c212b0a4cc97 Mon Sep 17 00:00:00 2001 From: ValVau <109755950+ValVau@users.noreply.github.com> Date: Mon, 8 May 2023 14:59:22 +0200 Subject: [PATCH 1/3] Add files via upload These files are a part of the current VHcc analysis with arrays saving method and BDT training. --- Zll_process_newHist_pandas.py | 1542 +++++++++++++++ ...s_newHist_pandas_small_update_isolation.py | 1687 +++++++++++++++++ cfg_VHcc_mod.py | 163 ++ xgb_test.py | 456 +++++ xgb_test_no_coffea.py | 605 ++++++ xgb_test_only_xgb.py | 361 ++++ xgb_test_only_xgb_no_coffea.py | 399 ++++ xgb_test_only_xgb_reloaded.py | 294 +++ xgb_test_only_xgb_reloaded_no_coffea.py | 287 +++ xgb_test_only_xgb_reloaded_no_coffea_var.py | 404 ++++ 10 files changed, 6198 insertions(+) create mode 100644 Zll_process_newHist_pandas.py create mode 100644 Zll_process_newHist_pandas_small_update_isolation.py create mode 100644 cfg_VHcc_mod.py create mode 100644 xgb_test.py create mode 100644 xgb_test_no_coffea.py create mode 100644 xgb_test_only_xgb.py create mode 100644 xgb_test_only_xgb_no_coffea.py create mode 100644 xgb_test_only_xgb_reloaded.py create mode 100644 xgb_test_only_xgb_reloaded_no_coffea.py create mode 100644 xgb_test_only_xgb_reloaded_no_coffea_var.py diff --git a/Zll_process_newHist_pandas.py b/Zll_process_newHist_pandas.py new file mode 100644 index 0000000..a35a4d4 --- /dev/null +++ b/Zll_process_newHist_pandas.py @@ -0,0 +1,1542 @@ +import csv +from curses import meta +from dataclasses import dataclass +import gzip +import pickle, os, sys, mplhep as hep, numpy as np +from select import select + +import json + +#from coffea import hist, processor # ToDo: move to the better hist +from coffea import processor # ToDo: move to the better hist +import hist +from hist import Hist +from coffea.nanoevents.methods import vector +import awkward as ak +from VHcc.utils.correction import jec,muSFs,eleSFs,init_corr +from coffea.lumi_tools import LumiMask +from coffea.analysis_tools import Weights, PackedSelection +from functools import partial +# import numba +from VHcc.helpers.util import reduce_and, reduce_or, nano_mask_or, get_ht, normalize, make_p4 + +def empty_column_accumulator(): + #return processor.column_accumulator(np.array([],dtype=object)) + return processor.column_accumulator(np.array([],dtype=np.float64)) +def array_accumulator(): + return processor.defaultdict_accumulator(empty_column_accumulator) + +def mT(obj1,obj2): + return np.sqrt(2.*obj1.pt*obj2.pt*(1.-np.cos(obj1.phi-obj2.phi))) +def flatten(ar): # flatten awkward into a 1d array to hist + return ak.flatten(ar, axis=None) +def normalize(val, cut): + if cut is None: + ar = ak.to_numpy(ak.fill_none(val, np.nan)) + return ar + else: + ar = ak.to_numpy(ak.fill_none(val[cut], np.nan)) + return ar + +def read_json(path): + f = open(path) + data = json.load(f) + return data + +def dataset_name_to_number(dataset, year): + samples_path = 'src/VHcc/metadata/sample_info_' + year + '_reversed' + + samples = read_json(samples_path+'.json') + + return samples[dataset]['type'], samples[dataset]['doJetFlavorSplit'] + +def dataset_categories(year): + map_path = 'src/VHcc/metadata/mergemap_' + year + '_Zll' + + samples = read_json(map_path+'.json').values() + all_datasets = [item for sublist in samples for item in sublist] + + return all_datasets + +def get_info_dict(year): + with open(f'src/VHcc/metadata/sample_info_{year}.json') as si: + info = json.load(si) + info_dict={} + for obj in info: + #print(obj) + info_dict[obj]=info[obj]['name'] + return 
info_dict + +class NanoProcessor(processor.ProcessorABC): + def __init__(self, cfg): + self.cfg = cfg + self._year = self.cfg.dataset["year"] + self._campaign = self.cfg.dataset["campaign"] + + self._version=self.cfg.userconfig['version'] # only because the new runner etc. needs that, not used later + self._export_array = True # if 'test' in self._version else False + self._debug = False #True + + # paths from table 1 and 2 of the AN_2020_235 + + # l l + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L3230-L3337 + self._mumu_hlt = { + '2016': [ + 'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL', + 'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ', + 'Mu17_TrkIsoVVL_TkMu8_TrkIsoVVL', + 'Mu17_TrkIsoVVL_TkMu8_TrkIsoVVL_DZ' + ], + '2017': [ + 'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL', + 'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ', + #'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ_Mass3p8',#allowMissingBranch=1 + #'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ_Mass8'#allowMissingBranch=1 + ], + '2018': [ + #'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL', + #'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ', + 'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ_Mass3p8',#allowMissingBranch=1 but this is the only used one in 2018?! + #'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ_Mass8'#allowMissingBranch=1 + ], + } + + self._ee_hlt = { + '2016': [ + 'Ele23_Ele12_CaloIdL_TrackIdL_IsoVL_DZ' + ], + '2017': [ + 'Ele23_Ele12_CaloIdL_TrackIdL_IsoVL', + #'Ele23_Ele12_CaloIdL_TrackIdL_IsoVL_DZ' # not in VHccAnalysis code + ], + '2018': [ + 'Ele23_Ele12_CaloIdL_TrackIdL_IsoVL' + ], + } + + ''' + # l nu + self._munu_hlt = { + '2016': [ + 'IsoMu24', + 'IsoTkMu24' + ], + '2017': [ + 'IsoMu24', + 'IsoMu27' + ], + '2018': [ + 'IsoMu24', + 'IsoMu27' + ], + } + + self._enu_hlt = { + '2016': [ + 'Ele27_eta2p1_WPTight_Gsf' + ], + '2017': [ + 'Ele32_WPTight_Gsf_L1DoubleEG', + 'Ele32_WPTight_Gsf' + ], + '2018': [ + 'Ele32_WPTight_Gsf_L1DoubleEG', + 'Ele32_WPTight_Gsf'#allowMissingBranch=1 + ], + } + + # nu nu + self._nunu_hlt = { + '2016': [ + 'PFMET110_PFMHT110_IDTight', + #'PFMET110_PFMHT120_IDTight', # found in hltbranches_2016.txt but not in AN, maybe redundant? + 'PFMET170_NoiseCleaned',#allowMissingBranch=1 + 'PFMET170_BeamHaloCleaned',#allowMissingBranch=1 + 'PFMET170_HBHECleaned' + ], + '2017': [ + 'PFMET110_PFMHT110_IDTight', + 'PFMET120_PFMHT120_IDTight', + 'PFMET120_PFMHT120_IDTight_PFHT60',#allowMissingBranch=1 + 'PFMETTypeOne120_PFMHT120_IDTight' + ], + '2018': [ + 'PFMET110_PFMHT110_IDTight', + 'PFMET120_PFMHT120_IDTight', + 'PFMET120_PFMHT120_IDTight_PFHT60'#allowMissingBranch=1 + ], + } + + ''' + + # differences between UL and EOY + # see https://twiki.cern.ch/twiki/bin/view/CMS/MissingETOptionalFiltersRun2 + # also look at sec. 
3.7.2 + self._met_filters = { + '2016': { + 'data': [ + 'goodVertices', + 'globalSuperTightHalo2016Filter', + 'HBHENoiseFilter', + 'HBHENoiseIsoFilter', + 'EcalDeadCellTriggerPrimitiveFilter', + 'BadPFMuonFilter', + #'BadPFMuonDzFilter', # not in EOY + 'eeBadScFilter', + ], + 'mc': [ + 'goodVertices', + 'globalSuperTightHalo2016Filter', + 'HBHENoiseFilter', + 'HBHENoiseIsoFilter', + 'EcalDeadCellTriggerPrimitiveFilter', + 'BadPFMuonFilter', + #'BadPFMuonDzFilter', # not in EOY + #'eeBadScFilter', # not suggested in EOY MC + ], + }, + '2017': { + "data": [ + "goodVertices", + "globalSuperTightHalo2016Filter", + "HBHENoiseFilter", + "HBHENoiseIsoFilter", + "EcalDeadCellTriggerPrimitiveFilter", + "BadPFMuonFilter", + "BadPFMuonDzFilter", + "hfNoisyHitsFilter", + "eeBadScFilter", + "ecalBadCalibFilter", + ], + "mc": [ + "goodVertices", + "globalSuperTightHalo2016Filter", + "HBHENoiseFilter", + "HBHENoiseIsoFilter", + "EcalDeadCellTriggerPrimitiveFilter", + "BadPFMuonFilter", + "BadPFMuonDzFilter", + "hfNoisyHitsFilter", + "eeBadScFilter", + "ecalBadCalibFilter", + ], + }, + '2018': { + 'data': [ + 'goodVertices', + 'globalSuperTightHalo2016Filter', + 'HBHENoiseFilter', + 'HBHENoiseIsoFilter', + 'EcalDeadCellTriggerPrimitiveFilter', + 'BadPFMuonFilter', + #'BadPFMuonDzFilter', # not in EOY + #'hfNoisyHitsFilter', # not in EOY + 'eeBadScFilter', + 'ecalBadCalibFilterV2', + ], + 'mc': [ + 'goodVertices', + 'globalSuperTightHalo2016Filter', + 'HBHENoiseFilter', + 'HBHENoiseIsoFilter', + 'EcalDeadCellTriggerPrimitiveFilter', + 'BadPFMuonFilter', + #'BadPFMuonDzFilter', # not in EOY + #'hfNoisyHitsFilter', # not in EOY + #'eeBadScFilter', # not suggested in EOY MC + 'ecalBadCalibFilterV2', + ], + }, + } + + # https://gitlab.cern.ch/aachen-3a/vhcc-nano/-/blob/master/crab/crab_all.py#L33-36 + #'https://cms-service-dqmdc.web.cern.ch/CAF/certification/Collisions16/13TeV/ReReco/Final/Cert_271036-284044_13TeV_23Sep2016ReReco_Collisions16_JSON.txt' + #'https://cms-service-dqmdc.web.cern.ch/CAF/certification/Collisions17/13TeV/ReReco/Cert_294927-306462_13TeV_EOY2017ReReco_Collisions17_JSON.txt' + #'https://cms-service-dqmdc.web.cern.ch/CAF/certification/Collisions18/13TeV/ReReco/Cert_314472-325175_13TeV_17SeptEarlyReReco2018ABC_PromptEraD_Collisions18_JSON.txt' + # downloaded. 
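+ # For reference, a coffea LumiMask is just a callable built from the golden-JSON
+ # run/lumi ranges: given the per-event run and luminosityBlock arrays it returns a
+ # boolean mask of certified events, e.g. (illustrative only, mirroring process() below):
+ # good = self._lumiMasks['2017'](events.run, events.luminosityBlock)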
+ self._lumiMasks = {
+ '2016': LumiMask('src/VHcc/data/Lumimask/Cert_271036-284044_13TeV_23Sep2016ReReco_Collisions16_JSON.txt'),
+ '2017': LumiMask('src/VHcc/data/Lumimask/Cert_294927-306462_13TeV_EOY2017ReReco_Collisions17_JSON.txt'),
+ '2018': LumiMask('src/VHcc/data/Lumimask/Cert_314472-325175_13TeV_17SeptEarlyReReco2018ABC_PromptEraD_Collisions18_JSON.txt')
+ }
+
+ self._corr = init_corr(self._year)
+
+ # Axes: Cat - what it is, a type of something, described with words
+ # Bin - how much of something, numerical things
+ #
+ # --> Some axes are already connected to specific objects, or to the event
+ # --> Others are "building-blocks" that can be reused multiple times
+
+ list_of_datasets = dataset_categories(self._year)
+ #print(list_of_datasets)
+ #sys.exit()
+ # Define axes
+ # Should read axes from NanoAOD config / metadata
+ #dataset_axis = hist.Cat("dataset", "Primary dataset")
+ dataset_axis = hist.axis.StrCategory([], name="dataset", label="Primary dataset", growth=True)
+ # split V+jets sample & VZ signal, this is per event
+ # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L2184-L2276
+ #datasetSplit_axis = hist.Cat("datasetSplit", "Dataset split by flav", list_of_datasets)
+ datasetSplit_axis = hist.axis.StrCategory(list_of_datasets, name="datasetSplit", label="Dataset split by flav")
+
+ # use hadronFlavour, necessary when applying btag scale factors (that depend on flavour)
+ # this one will be done per jet, can have values 0, 4, 5
+ #flav_axis = hist.Bin("flav", r"hadronFlavour",[0,1,4,5,6])
+ flav_axis = hist.axis.Variable([0,1,4,5,6], name="flav", label="hadronFlavour")
+
+ #lepflav_axis = hist.Cat("lepflav",['ee','mumu'])
+ lepflav_axis = hist.axis.StrCategory(['ee','mumu'], name="lepflav", label="Lepton flav")
+
+ regions = ['SR_2LL','SR_2LH',
+ 'CR_Zcc_2LL','CR_Zcc_2LH',
+ 'CR_Z_LF_2LL','CR_Z_LF_2LH',
+ 'CR_Z_HF_2LL','CR_Z_HF_2LH',
+ 'CR_t_tbar_2LL','CR_t_tbar_2LH']
+ #region_axis = hist.Cat("region",regions)
+ region_axis = hist.axis.StrCategory(regions, name="region", label="Region")
+
+ # Events
+ njet_axis = hist.axis.Regular(13, -.5, 12.5, name="nj", label="N jets") #hist.Bin("nj", r"N jets", 13, -.5, 12.5)
+
+ nAddJets_axis = hist.axis.Regular(11, -.5, 10.5, name="nAddJets302p5_puid", label="N additional jets")
+ #hist.Bin("nAddJets302p5_puid", r"N additional jets", 11, -.5, 10.5)
+ nAddJets_FSRsub_axis = hist.axis.Regular(11, -.5, 10.5, name="nAddJetsFSRsub302p5_puid", label="N additional jets (FSR subtracted)")
+ #hist.Bin("nAddJetsFSRsub302p5_puid", r"N additional jets (FSR subtracted)", 11, -.5, 10.5)
+
+ #nbjet_axis = hist.Bin("nbj", r"N b jets", [0,1,2,3,4,5])
+ #ncjet_axis = hist.Bin("ncj", r"N c jets", [0,1,2,3,4,5])
+ # kinematic variables
+ pt_axis = hist.axis.Regular(50, 0, 300, name="pt", label=r"$p_{T}$ [GeV]")
+ #hist.Bin("pt", r" $p_{T}$ [GeV]", 50, 0, 300)
+ eta_axis = hist.axis.Regular(25, -2.5, 2.5, name="eta", label=r"$\eta$")
+ #hist.Bin("eta", r" $\eta$", 25, -2.5, 2.5)
+ phi_axis = hist.axis.Regular(30, -3, 3, name="phi", label=r"$\phi$")
+ #hist.Bin("phi", r" $\phi$", 30, -3, 3)
+ mass_axis = hist.axis.Regular(50, 0, 300, name="mass", label=r"$m$ [GeV]")
+ #hist.Bin("mass", r" $m$ [GeV]", 50, 0, 300)
+ mt_axis = hist.axis.Regular(30, 0, 300, name="mt", label=r"$m_{T}$ [GeV]")
+ #hist.Bin("mt", r" $m_{T}$ [GeV]", 30, 0, 300)
+ dr_axis = hist.axis.Regular(20, 0, 5, name="dr", label=r"$\Delta$R")
+ #hist.Bin("dr","$\Delta$R",20,0,5)
+
+ # some more variables to check, which enter BDT
+ # need to revisit this later, because high Vpt and low Vpt can have different binning
+ jjVPtRatio_axis = hist.axis.Regular(15, 0, 2, name="jjVPtRatio", label=r"$p_{T}(jj) / p_{T}(V)$")
+ #hist.Bin("jjVPtRatio",r"$p_{T}(jj) / p_{T}(V)$",15,0,2)
+
+
+ #dphi_V_H_axis = hist.Bin("dphi_V_H","$\Delta\Phi(V, H)$",20,0,3.2)
+ # jet jet
+ #dr_j1_j2_axis = hist.Bin("dr_j1_j2","$\Delta R(j1,j2)$",20,0,5)
+ #dphi_j1_j2_axis = hist.Bin("dphi_j1_j2","$\Delta\Phi(j1,j2)$",15,-3.2,3.2)
+ #deta_j1_j2_axis = hist.Bin("deta_j1_j2","$\Delta\eta(j1,j2)$",15,0,3)
+ # lepton lepton
+ #dphi_l1_l2_axis = hist.Bin("dphi_l1_l2","$\Delta\Phi(l1,l2)$",15,0,3.2)
+ #deta_l1_l2_axis = hist.Bin("eta_l1_l2","$\Delta\eta(l1,l2)$",15,0,2.6)
+ # jet lepton
+ #dphi_j1_l1_axis = hist.Bin("dphi_j1_l1","$\Delta\Phi(j1,l1)$",15,0,3.2)
+ #dphi_j2_l1_axis = hist.Bin("dphi_j2_l1","$\Delta\Phi(j2,l1)$",15,0,3.2)
+ #dphi_j1_l2_axis = hist.Bin("dphi_j1_l2","$\Delta\Phi(j1,l2)$",15,0,3.2)
+ #dphi_j2_l2_axis = hist.Bin("dphi_j2_l2","$\Delta\Phi(j2,l2)$",15,0,3.2)
+
+ # ToDo: several other variables can only be stored after kinfit
+ # e.g. here https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/python/kinfitter.py
+ # or https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L4670-L4995
+
+ # weights are interesting as well
+ weight_axis = hist.axis.Regular(100, -5.001, 5.001, name="weight_full", label="weight_full")
+ #hist.Bin("weight_full","weight_full",100,-5.001, 5.001)
+ genweight_axis = hist.axis.Regular(100, -0.001, 0.001, name="genWeight", label="genWeight")
+ #hist.Bin("genWeight","genWeight",100,-0.001, 0.001)
+ sign_genweight_axis = hist.axis.Regular(100, -1.001, 1.001, name="genWeight_by_abs", label="genWeight/abs(genWeight)")
+ #hist.Bin("genWeight_by_abs","genWeight/abs(genWeight)",100,-1.001,1.001)
+
+
+ # MET vars
+ #signi_axis = hist.Bin("significance", r"MET $\sigma$",20,0,10)
+ #covXX_axis = hist.Bin("covXX",r"MET covXX",20,0,10)
+ #covXY_axis = hist.Bin("covXY",r"MET covXY",20,0,10)
+ #covYY_axis = hist.Bin("covYY",r"MET covYY",20,0,10)
+ #sumEt_axis = hist.Bin("sumEt", r" MET sumEt", 50, 0, 300)
+
+ # ToDo: switch to this
+ # axis.StrCategory([], name='region', growth=True),
+ #disc_list = [ 'btagDeepCvL', 'btagDeepCvB','btagDeepFlavCvB','btagDeepFlavCvL']#,'particleNetAK4_CvL','particleNetAK4_CvB']
+ # As far as I can tell, we only need DeepFlav currently
+ ### In all of the older stuff, use:
+ #disc_list = ['btagDeepFlavC','btagDeepFlavB','btagDeepFlavCvL','btagDeepFlavCvB']
+ ### With the new UL stuff, use:
+ disc_list = ['btagDeepFlavCvL','btagDeepFlavCvB']
+ btag_axes = []
+ for d in disc_list:
+ # technically, -1 values are possible, but probably unlikely to matter much after event selection
+ btag_axes.append(hist.axis.Regular(20, 0, 1, name=d, label=d)
+ #hist.Bin(d, d , 20, 0, 1)
+ )
+ #h = (
+ # Hist.new.Reg(10, -5, 5, overflow=False, underflow=False, name="A")
+ # .Bool(name="B")
+ # .Var(range(10), name="C")
+ # .Int(-5, 5, overflow=False, underflow=False, name="D")
+ # .IntCat(range(10), name="E")
+ # .StrCat(["T", "F"], name="F")
+ # .Double()
+ #)
+ #print(type(dataset_axis))
+ #print(type(lepflav_axis))
+ #print(type(flav_axis))
+ #print(type(njet_axis))
+ #print(type(hist.storage.Weight()))
+ #testHistA = Hist(dataset_axis, datasetSplit_axis, lepflav_axis, region_axis, njet_axis, hist.storage.Weight())
+ #testHist = Hist(
+ # #dataset_axis,
+ # #datasetSplit_axis,
+ # #lepflav_axis,
+ # #region_axis,
+ # njet_axis,
+ # hist.storage.Weight()
+ #
) + _hist_event_dict = { + 'nj' : Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + njet_axis, hist.storage.Weight()), + 'nAddJets302p5_puid' : Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + nAddJets_axis, hist.storage.Weight()), + 'nAddJetsFSRsub302p5_puid' : Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + nAddJets_FSRsub_axis, hist.storage.Weight()), + # 'weight_full' : Hist(datasetSplit_axis, + # lepflav_axis, + # region_axis, + # weight_axis, hist.storage.Weight()), + # 'genweight' : Hist(datasetSplit_axis, + # lepflav_axis, + # region_axis, + # genweight_axis, hist.storage.Weight()), + # 'sign_genweight' : Hist(datasetSplit_axis, + # lepflav_axis, + # region_axis, + # sign_genweight_axis, hist.storage.Weight()), + 'jjVPtRatio' : Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + jjVPtRatio_axis, hist.storage.Weight()) + + #'dphi_V_H' : Hist( dataset_axis, datasetSplit_axis, lepflav_axis, region_axis, ,dphi_V_H_axis) + #'dr_j1_j2' : Hist( dataset_axis, datasetSplit_axis, lepflav_axis, region_axis, ,dr_j1_j2_axis) + #'dphi_j1_j2' : Hist( dataset_axis, datasetSplit_axis, lepflav_axis, region_axis, dphi_j1_j2_axis) + #'deta_j1_j2' : Hist( dataset_axis, datasetSplit_axis, lepflav_axis, region_axis, deta_j1_j2_axis) + #'dphi_l1_l2' : Hist( dataset_axis, datasetSplit_axis, lepflav_axis, region_axis, dphi_l1_l2_axis) + #'dphi_j1_l2' : Hist( dataset_axis, datasetSplit_axis, lepflav_axis, region_axis, dphi_j1_l2_axis) + #'dphi_j2_l2' : Hist( dataset_axis, datasetSplit_axis, lepflav_axis, region_axis, dphi_j2_l2_axis) + + #'sampleFlavSplit' : Hist( dataset_axis, lepflav_axis, region_axis, sampleFlavSplit_axis), + #'nbj' : Hist( dataset_axis, lepflav_axis, region_axis, nbjet_axis), + #'ncj' : Hist( dataset_axis, lepflav_axis, region_axis, ncjet_axis), + #'hj_dr' : Hist( dataset_axis, lepflav_axis, region_axis, dr_axis), + #'MET_sumEt' : Hist( dataset_axis, lepflav_axis, region_axis, sumEt_axis), + #'MET_significance' : Hist( dataset_axis, lepflav_axis, region_axis, signi_axis), + #'MET_covXX' : Hist( dataset_axis, lepflav_axis, region_axis, covXX_axis), + #'MET_covXY' : Hist( dataset_axis, lepflav_axis, region_axis, covXY_axis), + #'MET_covYY' : Hist( dataset_axis, lepflav_axis, region_axis, covYY_axis), + #'MET_phi' : Hist( dataset_axis, lepflav_axis, region_axis, phi_axis), + #'MET_pt' : Hist( dataset_axis, lepflav_axis, region_axis, pt_axis), + #'mT1' : Hist( dataset_axis, lepflav_axis, region_axis, mt_axis), + #'mT2' : Hist( dataset_axis, lepflav_axis, region_axis, mt_axis), + #'mTh':Hist( dataset_axis, lepflav_axis, region_axis, mt_axis), + #'dphi_lep1':Hist( dataset_axis, lepflav_axis, region_axis, phi_axis), + #'dphi_lep2':Hist( dataset_axis, lepflav_axis, region_axis, phi_axis), + #'dphi_ll':Hist( dataset_axis, lepflav_axis, region_axis, phi_axis), + } + + # jets will be ordered by DeepJet (which is DeepFlav for historical reasons) + objects=['leading_jetflav','subleading_jetflav','lep1','lep2','ll','jj'] + + for i in objects: + # distinguish between jets and other objects, as the structure for jets contains additional flavour axis + if 'jet' in i: + _hist_event_dict["%s_pt" %(i)]=Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + flav_axis, + pt_axis, hist.storage.Weight()) + _hist_event_dict["%s_eta" %(i)]=Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + flav_axis, + eta_axis, hist.storage.Weight()) + _hist_event_dict["%s_phi" %(i)]=Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + flav_axis, + phi_axis, hist.storage.Weight()) + 
_hist_event_dict["%s_mass" %(i)]=Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + flav_axis, + mass_axis, hist.storage.Weight()) + else: + _hist_event_dict["%s_pt" %(i)]=Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + pt_axis, hist.storage.Weight()) + _hist_event_dict["%s_eta" %(i)]=Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + eta_axis, hist.storage.Weight()) + _hist_event_dict["%s_phi" %(i)]=Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + phi_axis, hist.storage.Weight()) + _hist_event_dict["%s_mass" %(i)]=Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + mass_axis, hist.storage.Weight()) + + # more information on the discriminators is stored for the first two jets, + # ordered by DeepJet CvL discriminator and called "leading" and "subleading" + for disc, axis in zip(disc_list,btag_axes): + _hist_event_dict["leading_jetflav_%s" %(disc)] = Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + flav_axis, + axis, hist.storage.Weight()) + _hist_event_dict["subleading_jetflav_%s" %(disc)] = Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + flav_axis, + axis, hist.storage.Weight()) + + self.event_hists = list(_hist_event_dict.keys()) + + # this can be used to not only store histograms, but also features on a per-event basis (arrays) + if self._export_array: + _hist_event_dict['array'] = processor.defaultdict_accumulator(array_accumulator) + #self._accumulator = processor.dict_accumulator( + # {**_hist_event_dict, + # #'cutflow': processor.defaultdict_accumulator( + # # partial(processor.defaultdict_accumulator, int)) + # }) + #self._accumulator['sumw'] = processor.defaultdict_accumulator(float) + + #self._accumulator = processor.dict_accumulator( + # { + # observable: Hist.Hist(var_axis, name="Counts", storage="Weight") + # for observable, var_axis in axis.items() + # if observable != "dataset" + # } + #) + #self._accumulator["cutflow"] = processor.defaultdict_accumulator( + # partial(processor.defaultdict_accumulator, int) + #) + #self._accumulator["sumw"] = 0 + + self.make_output = lambda: { + "cutflow": processor.defaultdict_accumulator( + partial(processor.defaultdict_accumulator, int) + ), + "sumw": 0, + **_hist_event_dict + } + + + @property + def accumulator(self): + return self._accumulator + + def process(self, events): + #output = self.accumulator #.identity() + output = self.make_output() + dataset = events.metadata['dataset'] + start = events.metadata['entrystart'] + stop = events.metadata['entrystop'] + output_location_list = [] + filename = events.metadata['filename'].split('/')[-1].strip('.root') + #print(dataset) + # Q: could there be MC that does not have this attribute? Or is it always the case? 
+ isRealData = not hasattr(events, "genWeight")
+
+ # Done (externally): map from the lengthy dataset (path) to a more readable name
+ # Keep the long name only for data, because it contains the Run info (necessary to apply corrections)
+ if isRealData:
+ info_dict = get_info_dict(self._year)
+ dataset_long = dataset
+ dictname = dataset[1:].split('/')[0]
+ dataset = info_dict[dictname]
+ print(dataset)
+ sample_type, doFlavSplit = dataset_name_to_number(dataset, self._year)
+ # length of events is used so many times later on, probably useful to just save it here and then refer to that
+ nEvents = len(events)
+ print('Number of events: ', nEvents)
+
+ # As far as I understand, this looks like a neat way to give selections a name,
+ # while internally, there are boolean arrays for all events
+ selection = PackedSelection()
+
+
+ # this is either counting events in data with weight 1, or weighted (MC)
+ if isRealData:
+ output['sumw'] += nEvents
+ else:
+ # instead of taking the weights themselves, only their sign is used:
+ # https://cms-talk.web.cern.ch/t/huge-event-weights-in-dy-powhegminnlo/8718/7
+ # I initially shared the concerns raised in that thread: if the absolute values
+ # differ between events (not just the sign), it is not obvious why keeping only
+ # the sign should average out. It must be tied to "LO without interference",
+ # where the absolute values are indeed identical; where they are not, the
+ # differences are considered to be negligible.
+ output['sumw'] += ak.sum(events.genWeight/abs(events.genWeight))
+
+
+ req_lumi=np.ones(nEvents, dtype='bool')
+ if isRealData:
+ req_lumi=self._lumiMasks[self._year](events.run, events.luminosityBlock)
+ selection.add('lumi',ak.to_numpy(req_lumi))
+ del req_lumi
+
+
+ # AS: sort of the same thing as above, but now per entry
+ weights = Weights(nEvents, storeIndividual=True)
+ if isRealData:
+ weights.add('genweight',np.ones(nEvents))
+ else:
+ weights.add('genweight',events.genWeight/abs(events.genWeight))
+ # weights.add('puweight', compiled['2017_pileupweight'](events.Pileup.nPU))
+
+
+ ##############
+ if isRealData:
+ output['cutflow'][dataset]['all'] += nEvents
+ output['cutflow'][dataset]['all (weight 1)'] += nEvents
+ else:
+ output['cutflow'][dataset]['all'] += ak.sum(events.genWeight/abs(events.genWeight))
+ output['cutflow'][dataset]['all (weight 1)'] += nEvents
+
+
+ #trigger_met = np.zeros(nEvents, dtype='bool')
+
+ trigger_ee = np.zeros(nEvents, dtype='bool')
+ trigger_mm = np.zeros(nEvents, dtype='bool')
+
+ #trigger_e = np.zeros(nEvents, dtype='bool')
+ #trigger_m = np.zeros(nEvents, dtype='bool')
+
+ #for t in self._nunu_hlt[self._year]:
+ # # so that already seems to be the check for whether the path exists in the file or not
+ # if t in events.HLT.fields:
+ # trigger_met = trigger_met | events.HLT[t]
+
+ for t in self._mumu_hlt[self._year]:
+ if t in events.HLT.fields:
+ trigger_mm = trigger_mm | events.HLT[t]
+
+ for t in self._ee_hlt[self._year]:
+ if t in events.HLT.fields:
+ trigger_ee = trigger_ee | events.HLT[t]
+
+ #for t in self._munu_hlt[self._year]:
+ # if t in events.HLT.fields:
+ # trigger_m = trigger_m | events.HLT[t]
+
+ #for t in self._emu_hlt[self._year]:
+ # if t in events.HLT.fields:
+ # trigger_e = trigger_e | events.HLT[t]
+
+
+ selection.add('trigger_ee', ak.to_numpy(trigger_ee))
+ selection.add('trigger_mumu', ak.to_numpy(trigger_mm))
+
+
+ # apart from the comments above about EOY/UL, should be fine
+ metfilter = np.ones(nEvents, dtype='bool')
+ for flag in
self._met_filters[self._year]['data' if isRealData else 'mc']: + metfilter &= np.array(events.Flag[flag]) + selection.add('metfilter', metfilter) + del metfilter + + + + # Not strictly necessary for Zll + met = ak.zip({ + "pt": events.MET.pt, + "phi": events.MET.phi, + "energy": events.MET.sumEt, + }, with_name="PtEtaPhiMLorentzVector" + ) + + + + split_by_flav = False + sampleFlavSplit = np.zeros(nEvents) + possible_flavSplits = ['already_split_sample'] + selection.add('already_split_sample',sampleFlavSplit == 0) + if not isRealData and not self._debug: + if doFlavSplit == '1' and not (int(sample_type) >= 27 and int(sample_type) <= 39): + split_by_flav = True + # uses the same naming scheme as AT, although udbsg is counterintuitive (b? [sic!]) + possible_flavSplits = ['_cc','_bb','_bc','_cl','_bl','_udbsg'] + # ================================================================================= + # + # # Split V+jets BG by flavour, via GenJet + # + # --------------------------------------------------------------------------------- + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L2184-L2228 + gen_jet = events.GenJet + + cGenJetTot = ak.sum((gen_jet.hadronFlavour == 4) & (gen_jet.pt > 20) & (abs(gen_jet.eta) < 2.4), axis=1) + bGenJetTot = ak.sum((gen_jet.hadronFlavour == 5) & (gen_jet.pt > 20) & (abs(gen_jet.eta) < 2.4), axis=1) + + tag_cc = cGenJetTot >= 2 + tag_bb = bGenJetTot >= 2 + tag_bc = (bGenJetTot == 1) & (cGenJetTot == 1) + tag_cl = (cGenJetTot == 1) & (bGenJetTot == 0) + tag_bl = (bGenJetTot == 1) & (cGenJetTot == 0) + tag_ll = (cGenJetTot == 0) & (bGenJetTot == 0) + + sampleFlavSplit = 1 * tag_cc + 2 * tag_bb + 3 * tag_bc + 4 * tag_cl + 5 * tag_bl + 6 * tag_ll + selection.add('_cc',sampleFlavSplit == 1) + selection.add('_bb',sampleFlavSplit == 2) + selection.add('_bc',sampleFlavSplit == 3) + selection.add('_cl',sampleFlavSplit == 4) + selection.add('_bl',sampleFlavSplit == 5) + selection.add('_udbsg',sampleFlavSplit == 6) # tbf I don't know why it contains b + + #elif dataset in ['WZTo1L1Nu2Q', 'ZZTo2L2Q', 'ZZTo2Q2Nu']: # VZ signal datasets + elif int(sample_type) in [32,36,37]: # VZ signal datasets + split_by_flav = True + possible_flavSplits = ['cc','bb','ll'] + # ================================================================================= + # + # # Split VZ signal by flavour, via GenPart + # + # --------------------------------------------------------------------------------- + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L2229-L2264 + gen_part = events.GenPart + + + Z_decay_mothers_A = (abs(gen_part.pdgId) == 23) & (gen_part.hasFlags('isLastCopy')) + + Z_decays = gen_part[Z_decay_mothers_A] + output['cutflow'][dataset]['GenPart VZ signal'] += ak.sum(Z_decay_mothers_A) + + n_b_from_Z = ak.sum(ak.sum(abs(Z_decays.children.pdgId) == 5, axis=-1), axis=-1) + n_c_from_Z = ak.sum(ak.sum(abs(Z_decays.children.pdgId) == 4, axis=-1), axis=-1) + + + + VZ_cc = (n_c_from_Z >= 2) + VZ_bb = (n_b_from_Z >= 2) + VZ_others = (~VZ_cc) & (~VZ_bb) + # 1, 2 and 3 identical to what was done in AnalysisTools! Do not confuse with BTV / hadron / parton flavour... 
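+ # e.g. a ZZ -> 2L2Q event where the hadronic Z goes to c cbar has n_c_from_Z = 2,
+ # so VZ_cc is True and the weighted sum below yields sampleFlavSplit == 1 ('cc');
+ # the boolean masks act as 0/1 integers in the arithmetic.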
+ sampleFlavSplit = 1 * VZ_cc + 2 * VZ_bb + 3 * VZ_others + + #print(sampleFlavSplit.type) + + selection.add('cc',sampleFlavSplit == 1) + selection.add('bb',sampleFlavSplit == 2) + selection.add('ll',sampleFlavSplit == 3) + + elif int(sample_type) in [27,28,29,30,31,33,34,35,38,39]: + possible_flavSplits = ['ll'] + sampleFlavSplit = sampleFlavSplit + 3 + selection.add('ll',sampleFlavSplit == 3) + split_by_flav = True + + # this is how it looked in AT for comparison: + ''' + else if( cursample->doJetFlavorSplit + && ( mInt("sampleIndex")==27 || mInt("sampleIndex")==28 + || mInt("sampleIndex")==29 || mInt("sampleIndex")==30 + || mInt("sampleIndex")==31 || mInt("sampleIndex")==33 + || mInt("sampleIndex")==34 || mInt("sampleIndex")==35 + || mInt("sampleIndex")==38 || mInt("sampleIndex")==39 + ) + ){ + *in["sampleIndex"] = mInt("sampleIndex")*100 + 3; + ''' + + + + + + # ================================================================================= + # + # # Reconstruct and preselect leptons + # + # --------------------------------------------------------------------------------- + + + # Adopt from https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L3369-L3440 + # https://gitlab.cern.ch/aachen-3a/vhcc-nano/-/blob/master/VHccProducer.py#L345-389 + + # ## Muon cuts + ## muon twiki: https://twiki.cern.ch/twiki/bin/view/CMS/SWGuideMuonIdRun2 + #event_mu = events.Muon[ak.argsort(events.Muon.pt, axis=1, ascending=False)] + event_mu = events.Muon + # looseId >= 1 or looseId seems to be the same... + musel = ((event_mu.pt > 20) & (abs(event_mu.eta) < 2.4) & (event_mu.looseId >= 1) & (event_mu.pfRelIso04_all<0.25)) + # but 25GeV and 0.06 for 1L, xy 0.05 z 0.2, &(abs(event_mu.dxy)<0.06)&(abs(event_mu.dz)<0.2) and tightId for 1L + event_mu = event_mu[musel] + event_mu = event_mu[ak.argsort(event_mu.pt, axis=1, ascending=False)] + event_mu["lep_flav"] = 13*event_mu.charge + event_mu= ak.pad_none(event_mu,2,axis=1) + nmu = ak.sum(musel,axis=1) + # ToDo: PtCorrGeoFit + + # ## Electron cuts + ## # electron twiki: https://twiki.cern.ch/twiki/bin/viewauth/CMS/CutBasedElectronIdentificationRun2 + #event_e = events.Electron[ak.argsort(events.Electron.pt, axis=1,ascending=False)] + event_e = events.Electron + elesel = ((event_e.pt > 20) & (abs(event_e.eta) < 2.5) & (event_e.mvaFall17V2Iso_WP90==1) & (event_e.pfRelIso03_all<0.25)) + # but 30GeV and WP80 for 1L + event_e = event_e[elesel] + # something I saw in a recent presentation, and also in AT code: + # https://indico.desy.de/event/34473/contributions/122201/attachments/76587/98753/RTG_Meeting_01_09_22.pdf + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/VHccAnalysis/PlotWithVarial/ZllHccLowPt.py#L256-L257 + # is to require "good electrons", which means excluding some region (eta), + # I guess it has sth to do with transition between barrel / endcap? 
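+ # (That is indeed the reason: 1.4442 < |eta| < 1.566 is the usual veto of the ECAL
+ # barrel-endcap transition ("crack") region, where electron reconstruction is
+ # degraded. Strictly the veto is defined on the supercluster eta,
+ # eta + deltaEtaSC, while the cut below uses the electron eta itself.)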
+ event_e = event_e[(abs(event_e.eta) > 1.5660) | (abs(event_e.eta) < 1.4442)] + event_e = event_e[ak.argsort(event_e.pt, axis=1,ascending=False)] + event_e["lep_flav"] = 11*event_e.charge + event_e = ak.pad_none(event_e,2,axis=1) + nele = ak.sum(elesel,axis=1) + # sorting after selecting should be faster (less computations on average) + + # for this channel (Zll / 2L) + selection.add('lepsel',ak.to_numpy((nele==2)|(nmu==2))) + + + + #### build lepton pair(s) + good_leptons = ak.with_name( + ak.concatenate([ event_e, event_mu], axis=1), + "PtEtaPhiMCandidate", ) + good_leptons = good_leptons[ak.argsort(good_leptons.pt, axis=1,ascending=False)] + leppair = ak.combinations( + good_leptons, + n=2, + replacement=False, + axis=-1, + fields=["lep1", "lep2"], + ) + + ll_cand = ak.zip({ + "lep1" : leppair.lep1, + "lep2" : leppair.lep2, + "pt": (leppair.lep1+leppair.lep2).pt, + "eta": (leppair.lep1+leppair.lep2).eta, + "phi": (leppair.lep1+leppair.lep2).phi, + "mass": (leppair.lep1+leppair.lep2).mass, + }, with_name="PtEtaPhiMLorentzVector" + ) + # probably there needs to be a cross-check that we don't include more than we want here, + # I know there is the option to truncate the array if more than 1 is found + # --> clip = True + ll_cand = ak.pad_none(ll_cand,1,axis=1) + + # there seem to be multiple ways to get the "one" ll_cand of interest + # - closest to Z-mass [makes sense] + # I think others use this + # - lepton-pair with highest pt [also, maybe it's even the same in the majority of the cases] + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L3369-L3440 + + if (ak.count(ll_cand.pt)>0): + ll_cand = ll_cand[ak.argsort(ll_cand.pt, axis=1,ascending=False)] + # try the second option here + # NOTE: Comment out to debug stuff + ll_cand = ll_cand[:, 0] + + + + # ================================================================================= + # + # # Reconstruct and preselect jets + # + # --------------------------------------------------------------------------------- + + # Apply correction: + if isRealData: + #print(dataset_long) + jets = jec(events,events.Jet,dataset_long,self._year,self._corr) + else: + jets = jec(events,events.Jet,dataset,self._year,self._corr) + #jets = events.Jet + + # This was necessary for the FSR code + #jets = jets.mask[ak.num(jets) > 2] + + + + # For EOY: recalculate CvL & CvB here, because the branch does not exist in older files + # adapted from PostProcessor + def deepflavcvsltag(jet): + btagDeepFlavL = 1.-(jet.btagDeepFlavC+jet.btagDeepFlavB) + return ak.where((jet.btagDeepFlavB >= 0.) & (jet.btagDeepFlavB < 1.) & (jet.btagDeepFlavC >= 0.) & (btagDeepFlavL >= 0.), + jet.btagDeepFlavC/(1.-jet.btagDeepFlavB), + (-1.) * ak.ones_like(jet.btagDeepFlavB)) + + def deepflavcvsbtag(jet): + btagDeepFlavL = 1.-(jet.btagDeepFlavC+jet.btagDeepFlavB) + return ak.where((jet.btagDeepFlavB > 0.) & (jet.btagDeepFlavC > 0.) & (btagDeepFlavL >= 0.), + jet.btagDeepFlavC/(jet.btagDeepFlavC+jet.btagDeepFlavB), + (-1.) * ak.ones_like(jet.btagDeepFlavB)) + + # Alternative ways: + # - depending on the Nano version, there might already be bTagDeepFlavCvL available + # - one could instead use DeepCSV via bTagDeepCvL + # - not necessarily use CvL, other combination possible ( CvB | pt | BDT? 
) + + #jets["btagDeepFlavCvL"] = deepflavcvsltag(jets) + #jets["btagDeepFlavCvB"] = deepflavcvsbtag(jets) + jets = jets[ak.argsort(jets.btagDeepFlavCvL, axis=1, ascending=False)] + + + # Jets are considered only if the following identification conditions hold, as mentioned in AN + # - Here is some documentation related to puId and jetId: + # https://twiki.cern.ch/twiki/bin/viewauth/CMS/PileupJetID + # https://twiki.cern.ch/twiki/bin/viewauth/CMS/JetID + jet_conditions = ((abs(jets.eta) < 2.4) & (jets.pt > 20) & (jets.puId > 0)) \ + | ((jets.pt>50) & (jets.jetId>5)) + # Count how many jets exist that pass this selection + njet = ak.sum(jet_conditions,axis=1) + selection.add('jetsel',ak.to_numpy(njet>=2)) + + + # ================================================================================= + # + # # FSR recovery + # + # --------------------------------------------------------------------------------- + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L841-L956 + + # FSR jets are selected with slightly different criteria + fsr_conditions = (abs(jets.eta) < 3) & (jets.pt > 20) \ + & ak.all(jets.metric_table(ll_cand.lep1)>0.2) & ak.all(jets.metric_table(ll_cand.lep2)>0.2) + # Take the first two jets that pass the criteria and check the remaining ones, + # as well as potentially others, to get FSR jets: + pick2 = jets[ak.pad_none(ak.local_index(jets, 1)[jet_conditions], 2)[:, :2]] + others = jets[ak.concatenate([ak.pad_none(ak.local_index(jets, 1)[(jet_conditions) & (fsr_conditions)], 2)[:, 2:], + ak.local_index(jets, 1)[(~jet_conditions) & (fsr_conditions)] + ], axis=1)] + + + def find_fsr(leading, subleading, others, threshold=0.8): + mval1, (a1, b) = leading.metric_table(others, return_combinations=True) + mval2, (a2, b) = subleading.metric_table(others, return_combinations=True) + + def res(mval, out): + order = ak.argsort(mval, axis=-1) + return out[order], mval[order] + + out1, metric1 = res(mval1, b) + out2, metric2 = res(mval2, b) + + out1 = out1.mask[(metric1 <= threshold) & (metric1 < metric2)] + out2 = out2.mask[(metric2 <= threshold) & (metric2 < metric1)] + #out2 = out2.mask[(metric1 <= threshold) & (metric2 < metric1)] + return out1[:, 0, ...], out2[:, 0, ...] 
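+
+ # In words: find_fsr returns, per event, the candidate FSR jets assigned to the
+ # leading and subleading jet respectively; each jet in `others` within
+ # deltaR <= threshold is attached to whichever of the two it is closer to
+ # (on an exact tie both strict '<' comparisons fail and the jet goes to neither).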
+ + + missing = ~(ak.is_none(pick2[:, 0]) | ak.is_none(pick2[:, 1])) + pick2 = pick2.mask[missing] + others = others.mask[missing] + + + leading, subleading = pick2[:, 0], pick2[:, 1] + fsr_leading, fsr_subleading = find_fsr(leading, subleading, others, threshold=0.8) + + #print(leading.pt) + #print((leading + fsr_leading.sum()).pt) + + # To explicitly check that adding FSR does indeed have an effect + #print(ak.sum((leading + fsr_leading.sum()).pt != leading.pt)) + + #print(leading.type) + + # Collect the (sub-)leading jets and their respective FSR jets in a new 4-vector + leading_with_fsr = ak.zip({ + "jet1" : leading, + "jet2" : fsr_leading.sum(), + "pt": (leading + fsr_leading.sum()).pt, + "eta": (leading + fsr_leading.sum()).eta, + "phi": (leading + fsr_leading.sum()).phi, + "mass": (leading + fsr_leading.sum()).mass, + },with_name="PtEtaPhiMLorentzVector",) + + subleading_with_fsr = ak.zip({ + "jet1" : subleading, + "jet2" : fsr_subleading.sum(), + "pt": (subleading + fsr_subleading.sum()).pt, + "eta": (subleading + fsr_subleading.sum()).eta, + "phi": (subleading + fsr_subleading.sum()).phi, + "mass": (subleading + fsr_subleading.sum()).mass, + },with_name="PtEtaPhiMLorentzVector",) + + + # (Maybe) one could calculate the angle between FSR & the "main" jet they correspond to + # - this would be correlated with the mass of the decaying p. via the dead-cone effect, + # - could be a discriminating variable at the event level. + + # ================================================================================= + # + # # Build Higgs candidate w/ or w/o FSR + # + # --------------------------------------------------------------------------------- + + # Build 4-vector from leading + subleading jets, with or without FSR + higgs_cand_no_fsr = ak.zip({ + "jet1" : leading, + "jet2" : subleading, + "pt": (leading + subleading).pt, + "eta": (leading + subleading).eta, + "phi": (leading + subleading).phi, + "mass": (leading + subleading).mass, + },with_name="PtEtaPhiMLorentzVector",) + + higgs_cand = ak.zip({ + "jet1" : leading_with_fsr, + "jet2" : subleading_with_fsr, + "pt": (leading_with_fsr + subleading_with_fsr).pt, + "eta": (leading_with_fsr + subleading_with_fsr).eta, + "phi": (leading_with_fsr + subleading_with_fsr).phi, + "mass": (leading_with_fsr + subleading_with_fsr).mass, + },with_name="PtEtaPhiMLorentzVector",) + + + + # ================================================================================= + # + # # Actual event selection starts here + # + # --------------------------------------------------------------------------------- + + + # Common global requirements in the Zll channel + # - valid for 2LH and 2LL + # - valid for any region, no matter if SR or CR + + # leppair and ll_cand have different dim, leppair contains lists, + # ll_cand only numbers on innermost dim (because already reduced above) + # therefore when evaluating ak.any with axis=-1, + # ll_cand will ALWAYS be true (a.k.a. 
for every event), as long as one event fulfils the criterion + # for leppair, there needs to be one per event, as expected + # print((leppair.lep1.pt>20)) + # print((ll_cand.mass>75)) + # print((higgs_cand.mass<250)) + # print((njet>=2)) + # inside any one can then only place stuff that has one more dim + + # related to individual leptons + req_global = ak.any((leppair.lep1.pt>20) & (leppair.lep2.pt>20) \ + # opposite charge + & ((leppair.lep1.charge+leppair.lep2.charge)==0) \ + , axis=-1 + ) + # cands and global stuff + # note: V_pt > 60 as in AT, AN: 50 (don't confuse) + req_global = req_global \ + & (ll_cand.pt>60) \ + & (njet>=2) \ + & (higgs_cand.mass<250) + + + selection.add('global_selection',ak.to_numpy(req_global)) + + + mask2e = req_global & (nele == 2) + mask2mu = req_global & (nmu == 2) + + #mask2lep = [ak.any(tup) for tup in zip(maskemu, mask2mu, mask2e)] + mask2lep = [ak.any(tup) for tup in zip(mask2mu, mask2e)] + + good_leptons = ak.mask(good_leptons,mask2lep) + + + #output['cutflow'][dataset]['selected Z pairs'] += ak.sum(ak.num(good_leptons)>0) + + selection.add('ee',ak.to_numpy(nele == 2)) + selection.add('mumu',ak.to_numpy(nmu == 2)) + + + #print(higgs_cand.type) + #print(ll_cand.type) + + # global already contains Vpt>60 as the lower bound + # global also has higgs_cand.mass<250 + req_sr_Zll = (ll_cand.mass > 75) & (ll_cand.mass < 105) \ + & (higgs_cand.delta_phi(ll_cand)>2.5) \ + & (higgs_cand.mass>=50) & (higgs_cand.mass<=200) \ + & (leading.btagDeepFlavCvL>0.225) & (leading.btagDeepFlavCvB>0.4) + # flip H mass, otherwise same + req_cr_Zcc = (ll_cand.mass > 85) & (ll_cand.mass < 97) \ + & (higgs_cand.delta_phi(ll_cand)>2.5) \ + & ~((higgs_cand.mass>=50) & (higgs_cand.mass<=200)) \ + & (leading.btagDeepFlavCvL>0.225) & (leading.btagDeepFlavCvB>0.4) + # Note: m_ll requirement not in AN, but in AT + req_cr_Z_LF = (ll_cand.mass > 75) & (ll_cand.mass < 105) \ + & (higgs_cand.delta_phi(ll_cand)>2.5) \ + & (higgs_cand.mass>=50) & (higgs_cand.mass<=200) \ + & (leading.btagDeepFlavCvL<0.225) & (leading.btagDeepFlavCvB>0.4) + + req_cr_Z_HF = (ll_cand.mass > 85) & (ll_cand.mass < 97) \ + & (higgs_cand.delta_phi(ll_cand)>2.5) \ + & (higgs_cand.mass>=50) & (higgs_cand.mass<=200) \ + & (leading.btagDeepFlavCvL>0.225) & (leading.btagDeepFlavCvB<0.4) + + req_cr_t_tbar = ~((ll_cand.mass>0) & (ll_cand.mass<10)) & ~((ll_cand.mass>75) & (ll_cand.mass<120)) \ + & (higgs_cand.mass>=50) & (higgs_cand.mass<=200) \ + & (leading.btagDeepFlavCvL>0.225) & (leading.btagDeepFlavCvB<0.4) + + req_sr_Zll_vpt_low = req_global & req_sr_Zll & (ll_cand.pt<150) + # print(ll_cand.pt<150) + # print(ak.any(ll_cand.pt<150, axis=-1) + # print(req_sr_Zll_vpt_low) + req_sr_Zll_vpt_high = req_global & req_sr_Zll & (ll_cand.pt>150) + # print(ll_cand.pt>150) + # print(req_sr_Zll_vpt_high) + # print(len(req_sr_Zll_vpt_low)) + # print(len(req_sr_Zll_vpt_low == req_sr_Zll_vpt_high)) + # print(np.sum(ak.to_numpy(req_sr_Zll_vpt_low))) + # print(np.sum(ak.to_numpy(req_sr_Zll_vpt_low == req_sr_Zll_vpt_high))) + + req_cr_Zcc_vpt_low = req_global & req_cr_Zcc & (ll_cand.pt<150) + # print(req_sr_Zll_vpt_low) + req_cr_Zcc_vpt_high = req_global & req_cr_Zcc & (ll_cand.pt>150) + # print(req_sr_Zll_vpt_high) + # print(np.sum(ak.to_numpy(req_sr_Zll_vpt_low & req_sr_Zll_vpt_high))) + + req_cr_Z_LF_vpt_low = req_global & req_cr_Z_LF & (ll_cand.pt<150) + req_cr_Z_LF_vpt_high = req_global & req_cr_Z_LF & (ll_cand.pt>150) + + req_cr_Z_HF_vpt_low = req_global & req_cr_Z_HF & (ll_cand.pt<150) + req_cr_Z_HF_vpt_high = req_global & 
req_cr_Z_HF & (ll_cand.pt>150) + + req_cr_t_tbar_vpt_low = req_global & req_cr_t_tbar & (ll_cand.pt<150) + req_cr_t_tbar_vpt_high = req_global & req_cr_t_tbar & (ll_cand.pt>150) + + + #prob not necessary + #selection.add('SR',ak.to_numpy(req_sr_Zll)) + + selection.add('SR_2LL',ak.to_numpy(req_sr_Zll_vpt_low)) + selection.add('SR_2LH',ak.to_numpy(req_sr_Zll_vpt_high)) + selection.add('CR_Zcc_2LL',ak.to_numpy(req_cr_Zcc_vpt_low)) + selection.add('CR_Zcc_2LH',ak.to_numpy(req_cr_Zcc_vpt_high)) + selection.add('CR_Z_LF_2LL',ak.to_numpy(req_cr_Z_LF_vpt_low)) + selection.add('CR_Z_LF_2LH',ak.to_numpy(req_cr_Z_LF_vpt_high)) + selection.add('CR_Z_HF_2LL',ak.to_numpy(req_cr_Z_HF_vpt_low)) + selection.add('CR_Z_HF_2LH',ak.to_numpy(req_cr_Z_HF_vpt_high)) + selection.add('CR_t_tbar_2LL',ak.to_numpy(req_cr_t_tbar_vpt_low)) + selection.add('CR_t_tbar_2LH',ak.to_numpy(req_cr_t_tbar_vpt_high)) + + + + + + # ================================================================================= + # + # # Calculate and store weights & factors + # + # --------------------------------------------------------------------------------- + + # there is also nProcEvents, which might be related to nEvents by some factor + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/HelperClasses/SampleContainer.cc + # there are some more calculations related to weights, e.g. + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/HelperClasses/SampleContainer.cc#L115-L154 + + # ToDo: + # [ ] LHEScaleWeight ?? + # [ ] intWeight - is this only relevant when running over the post-processed samples, or already on top of Nano+AK15? + # [x] genWeight + # [ ] PrefireWeight - (for 2016+2017) see also: + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L2099-L2113 + # [ ] weight_PU + # [ ] weight_ptEWK + # [(x)] Lep_SF - but I'm not sure about EOY / UL compatibility + # [ ] recoWReWeight + # [ ] WJetNLOWeight + # [ ] cTagWeight - later, also including up/down syst + # [ ] weight_mettrigSF + # [ ] weight_puid - not the same as _PU + # [ ] weight_subptEWKnnlo - find out what "SubGen" is + # + # [ ] LOtoNLOWeightBjetSplitEtabb + # [ ] WPtCorrFactor + # [ ] ZPtCorrFactor + + + + + # running over more than just the Double[] datasets, but still requiring the same trigger + # not sure if correct + if 'DoubleEG' in dataset or 'Electron' in dataset: + output['cutflow'][dataset]['trigger'] += ak.sum(trigger_ee) + elif 'Muon' in dataset : + output['cutflow'][dataset]['trigger'] += ak.sum(trigger_mm) + + + # Successively add another cut w.r.t. 
previous line, looks a bit like N-1 histograms + output['cutflow'][dataset]['jet selection'] += ak.sum(njet>=2) + output['cutflow'][dataset]['global selection'] += ak.sum(req_global) + output['cutflow'][dataset]['signal region'] += ak.sum(req_global & req_sr_Zll) + output['cutflow'][dataset]['signal region & ee or mumu'] += ak.sum(req_global & req_sr_Zll & ( ((nele == 2) & trigger_ee) | ((nmu == 2) & trigger_mm))) + output['cutflow'][dataset]['signal ee'] += ak.sum(req_global & req_sr_Zll & (nele == 2) & trigger_ee) + output['cutflow'][dataset]['signal mumu'] += ak.sum(req_global & req_sr_Zll & (nmu == 2) & trigger_mm) + + + lepflav = ['ee','mumu'] + reg = ['SR_2LL','SR_2LH', + 'CR_Zcc_2LL','CR_Zcc_2LH', + 'CR_Z_LF_2LL','CR_Z_LF_2LH', + 'CR_Z_HF_2LL','CR_Z_HF_2LH', + 'CR_t_tbar_2LL','CR_t_tbar_2LH'] + + #print(possible_flavSplits) + list_weights = [] + lists_of_vars = {} + names = ['wei', 'Higgs_mass', 'Higgs_pt', 'Z_mass', 'Z_pt', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_phi_jj', 'del_eta_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + modes = ['low_ee', 'high_ee', 'low_mumu', 'high_mumu'] + for name in names: + for mode in modes: + lists_of_vars[f'{name}_{mode}'] = [] + ''' + lists_of_vars = {'wei': [], + 'Higgs_mass': [], + 'Higgs_pt': [], + 'Z_pt': [], + 'jjVptratio': [], + 'CvsL_max': [], + 'CvsL_min': [], + 'CvsB_max': [], + 'CvsB_min': [], + 'pt_lead': [], + 'pt_sublead': [], + 'del_phi_jjV': [], + 'del_R_jj': [], + 'del_eta_jj': [], + 'del_phi_ll': [], + 'del_eta_ll': [], + 'del_phi_l2_subleading': [], + 'del_phi_l2_leading': [] + } + ''' + #### write into histograms (i.e. write output) + for histname, h in output.items(): + for s in possible_flavSplits: + dataset_renamed = dataset if s == 'already_split_sample' else dataset + s + for ch in lepflav: + for r in reg: + cut = selection.all('lepsel', + 'jetsel', + 'global_selection', + 'metfilter', + 'lumi', + r, + ch, + s, + 'trigger_%s'%(ch)) + llcut = ll_cand[cut] + # this next line is necessary if running with multiple possible ll candidates + #llcut = llcut[:,0] + + lep1cut = llcut.lep1 + lep2cut = llcut.lep2 + #print(self._version) + if not isRealData and not self._debug: + #print('not data, not test') + if ch == 'ee': + lepsf = eleSFs(lep1cut, self._year, self._corr) * eleSFs(lep2cut, self._year, self._corr) + elif ch == 'mumu': + lepsf = muSFs(lep1cut, self._year, self._corr) * muSFs(lep2cut, self._year, self._corr) + ''' + # This would be emu channel, which does not exist in the VHcc Zll case + else: + lepsf = np.where(lep1cut.lep_flav == 11, + eleSFs(lep1cut, self._year, self._corr) * muSFs(lep2cut, self._year, self._corr), + 1.) \ + * np.where(lep1cut.lep_flav == 13, + eleSFs(lep2cut, self._year, self._corr) * muSFs(lep1cut, self._year, self._corr), + 1.) + ''' + else : + #lepsf = weights.weight()[cut] + # AS: if I understand correctly, this only works because in case of data, weights are identically 1 for every entry + # otherwise this would double count the weights in a later step (where lepsf gets multiplied by the weights!) 
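+ # hence for data, lepsf is set to exactly 1 per selected event (same shape as
+ # weights.weight()[cut]), keeping the `weight * lepsf` products below uniform
+ # across data and MC: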
+ lepsf = ak.full_like(weights.weight()[cut], 1) + #print(lepsf) + # print(weights.weight()[cut]*lepsf) + # print(lepsf) + ''' + if self._export_array and not isRealData: + if ch == 'ee' and r == 'SR_2LL' and s == '_cc': + eell_cand = ak.zip({ + "Higgs_mass" : higgs_cand['mass'][cut] * lepsf, + #"jet2" : subleading_with_fsr, + #"pt": (leading_with_fsr + subleading_with_fsr).pt, + #"eta": (leading_with_fsr + subleading_with_fsr).eta, + #"phi": (leading_with_fsr + subleading_with_fsr).phi, + #"mass": (leading_with_fsr + subleading_with_fsr).mass, + }) + print(eell_cand) + ''' + if 'leading_jetflav_' in histname and 'sub' not in histname: + #print(dir(leading)) + #print(h.axes) + names = [ax.name for ax in h.axes] + fields = {l: normalize(leading[histname.replace('leading_jetflav_','')], + cut) for l in names if l in dir(leading)} + #print(fields) + #sys.exit() + if isRealData: + flavor = ak.zeros_like(normalize(leading['pt'],cut)) + else: + flavor = normalize(leading.hadronFlavour,cut) + h.fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + flav = flavor, + **fields, + weight = weights.weight()[cut] * lepsf) + elif 'subleading_jetflav_' in histname: + #print(dir(subleading)) + names = [ax.name for ax in h.axes] + fields = {l: normalize(subleading[histname.replace('subleading_jetflav_','')], + cut) for l in names if l in dir(subleading)} + if isRealData: + flavor = ak.zeros_like(normalize(subleading['pt'],cut)) + else: + flavor = normalize(subleading.hadronFlavour,cut) + h.fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + flav = flavor, + **fields, + weight = weights.weight()[cut] * lepsf) + elif 'lep1_' in histname: + names = [ax.name for ax in h.axes] + fields = {l: ak.fill_none(flatten(lep1cut[histname.replace('lep1_','')]), + np.nan) for l in names if l in dir(lep1cut)} + h.fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + **fields, + weight = weights.weight()[cut] * lepsf) + elif 'lep2_' in histname: + names = [ax.name for ax in h.axes] + fields = {l: ak.fill_none(flatten(lep2cut[histname.replace('lep2_','')]), + np.nan) for l in names if l in dir(lep2cut)} + h.fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + **fields, + weight = weights.weight()[cut] * lepsf) + #elif 'MET_' in histname: + # fields = {l: normalize(events.MET[histname.replace('MET_','')], + # cut) for l in names if l in dir(events.MET)} + # h.fill( + # datasetSplit = dataset_renamed, + # lepflav = ch, + # region = r, + # **fields, + # weight = weights.weight()[cut] * lepsf) + elif 'll_' in histname: + names = [ax.name for ax in h.axes] + fields = {l: ak.fill_none(flatten(llcut[histname.replace('ll_','')]), + np.nan) for l in names if l in dir(llcut)} + #print(max(llcut['pt'])) + h.fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + **fields, + weight = weights.weight()[cut] * lepsf) + elif 'jj_' in histname: + names = [ax.name for ax in h.axes] + fields = {l: normalize(higgs_cand[histname.replace('jj_','')], + cut) for l in names if l in dir(higgs_cand)} + h.fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + **fields, + weight = weights.weight()[cut] * lepsf) + else: + output['nj'].fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + nj = normalize(ak.num(jet_conditions),cut), + weight = weights.weight()[cut]*lepsf) + # check? 
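+ # (Re the check: ak.num(jet_conditions) is the length of each per-event list,
+ # i.e. it counts *all* jets, while the number of jets passing jet_conditions
+ # would be ak.sum(jet_conditions, axis=1), as used for `njet` earlier; the same
+ # holds for the two nAddJets fills below, which subtract 2 and clip at zero.)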
+ output['nAddJets302p5_puid'].fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + nAddJets302p5_puid = normalize(ak.where((ak.num(jet_conditions) > 2), + (ak.num(jet_conditions)-2), + (ak.zeros_like(ak.num(jet_conditions))) + ), + cut), + weight = weights.weight()[cut]*lepsf) + # check? + output['nAddJetsFSRsub302p5_puid'].fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + nAddJetsFSRsub302p5_puid = normalize(ak.where((ak.where((ak.num(jet_conditions) > 2), + (ak.num(jet_conditions)-2), + (ak.zeros_like(ak.num(jet_conditions))) + ) + -ak.num((~jet_conditions) & (fsr_conditions))) > 0, + (ak.where((ak.num(jet_conditions) > 2), + (ak.num(jet_conditions)-2), + (ak.zeros_like(ak.num(jet_conditions))) + ) + -ak.num((~jet_conditions) & (fsr_conditions))), + (ak.zeros_like(ak.num(jet_conditions)))), + cut), + weight = weights.weight()[cut]*lepsf) + #if not isRealData: + # output['weight_full'].fill( + # datasetSplit = dataset_renamed, + # lepflav = ch, + # region = r, + # weight_full = weights.weight()[cut]*lepsf) + # output['genweight'].fill( + # datasetSplit = dataset_renamed, + # lepflav = ch, + # region = r, + # genWeight = events.genWeight[cut]) + # output['sign_genweight'].fill( + # datasetSplit = dataset_renamed, + # lepflav = ch, + # region = r, + # genWeight_by_abs = (events.genWeight/abs(events.genWeight))[cut]) + output['jjVPtRatio'].fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + jjVPtRatio = (normalize(higgs_cand['pt'], + cut) / ak.fill_none(flatten(llcut['pt']), + np.nan)), + weight = weights.weight()[cut] * lepsf) + if self._export_array and not isRealData: + import pandas as pd + #output['array'][dataset]['weight'] += processor.column_accumulator( + # ak.to_numpy(weights.weight()[cut] * lepsf) + # ) + + list_weights.append(ak.to_numpy(weights.weight()[cut] * lepsf)) + + roi = ['SR_2LL','SR_2LH'] + lepflav_chosen = ['ee','mumu'] + names_dict = {'wei': weights.weight()[cut] * lepsf, + 'Higgs_mass': higgs_cand['mass'][cut] * lepsf, + 'Higgs_pt': higgs_cand['pt'][cut] * lepsf, + 'Z_mass': ll_cand['mass'][cut] * lepsf, + 'Z_pt': ll_cand['pt'][cut] * lepsf, + 'jjVptratio': (higgs_cand['pt'][cut] * lepsf)/ (ll_cand['pt'][cut] * lepsf), + 'CvsL_max': leading_with_fsr['jet1']['btagDeepFlavCvL'][cut] * lepsf, + 'CvsL_min': subleading_with_fsr['jet1']['btagDeepFlavCvL'][cut] * lepsf, + 'CvsB_max': leading_with_fsr['jet1']['btagDeepFlavCvB'][cut] * lepsf, + 'CvsB_min': subleading_with_fsr['jet1']['btagDeepFlavCvB'][cut] * lepsf, + 'pt_lead': leading_with_fsr['jet1']['pt'][cut] * lepsf, + 'pt_sublead': subleading_with_fsr['jet1']['pt'][cut] * lepsf, + 'del_phi_jjV': np.abs((higgs_cand[cut] * lepsf).delta_phi((ll_cand[cut] * lepsf))), + 'del_R_jj': np.abs((higgs_cand['jet1'][cut] * lepsf).delta_r((higgs_cand['jet2'][cut] * lepsf))), + 'del_eta_jj': np.abs((higgs_cand['jet1']['eta'][cut] * lepsf) - ((higgs_cand['jet2']['eta'][cut] * lepsf))), + 'del_phi_jj': np.abs((higgs_cand['jet1'][cut] * lepsf).delta_phi((higgs_cand['jet2'][cut] * lepsf))), + 'del_phi_ll': np.abs((ll_cand['lep1'][cut] * lepsf).delta_phi((ll_cand['lep2'][cut] * lepsf))), + 'del_eta_ll': np.abs((ll_cand['lep1']['eta'][cut] * lepsf) - ((ll_cand['lep2']['eta'][cut] * lepsf))), + 'del_phi_l2_subleading': np.abs((ll_cand['lep2'][cut] * lepsf).delta_phi((higgs_cand['jet1'][cut] * lepsf))), + 'del_phi_l2_leading': np.abs((ll_cand['lep2'][cut] * lepsf).delta_phi((higgs_cand['jet2'][cut] * lepsf))) + } + if ch in lepflav_chosen and r in roi: + if ch == 'ee': + if r == 
'SR_2LL': + for var_name, var_value in names_dict.items(): + lists_of_vars[f'{var_name}_low_ee'].append(ak.to_numpy(var_value)) + #output['array'][dataset][f'{var_name}_low_ee'] += processor.column_accumulator( + # ak.to_numpy(var_value) + # ) + elif r == 'SR_2LH': + for var_name, var_value in names_dict.items(): + lists_of_vars[f'{var_name}_high_ee'].append(ak.to_numpy(var_value)) + #output['array'][dataset][f'{var_name}_high_ee'] += processor.column_accumulator( + # ak.to_numpy(var_value) + # ) + elif ch == 'mumu': + if r == 'SR_2LL': + for var_name, var_value in names_dict.items(): + lists_of_vars[f'{var_name}_low_mumu'].append(ak.to_numpy(var_value)) + #output['array'][dataset][f'{var_name}_low_mumu'] += processor.column_accumulator( + # ak.to_numpy(var_value) + # ) + elif r == 'SR_2LH': + for var_name, var_value in names_dict.items(): + lists_of_vars[f'{var_name}_high_mumu'].append(ak.to_numpy(var_value)) + #output['array'][dataset][f'{var_name}_high_mumu'] += processor.column_accumulator( + # ak.to_numpy(var_value) + # ) + + + + ### + regression, kinfit ??? + list_weights = np.array([item for sublist in list_weights for item in sublist]) + print(list_weights) + #print(lists_of_vars) + for v_name in lists_of_vars.keys(): + lists_of_vars[v_name] = np.array([item for sublist in lists_of_vars[v_name] for item in sublist]) + print(lists_of_vars) + if 'ZH' in dataset: + ttyp = 'signal_03' + else: + ttyp = 'back_03' + folder_save = f'condor_{ttyp}' + if not os.path.exists(f"./{folder_save}"): + os.mkdir(f"./{folder_save}") + if not os.path.exists(f"./{folder_save}/{dataset}"): + os.mkdir(f"./{folder_save}/{dataset}") + if not os.path.exists(f"./{folder_save}/{dataset}/{filename}"): + os.mkdir(f"./{folder_save}/{dataset}/{filename}") + try: + df_weights = pd.read_csv(f'{folder_save}/{dataset}/{filename}/test_save_weights_full.csv') + except FileNotFoundError: + df_weights = pd.DataFrame([], columns = ['weights']) + df_wei = pd.DataFrame([], columns = ['weights']) + df_wei['weights'] = list_weights + df_weights_full = pd.concat([df_weights, df_wei], ignore_index = True) + df_wei.to_csv(f'{folder_save}/{dataset}/{filename}/test_save_weights.csv', sep=',', encoding='utf-8', index=False) + df_weights_full.to_csv(f'{folder_save}/{dataset}/{filename}/test_save_weights_full.csv', sep=',', encoding='utf-8', index=False) + try: + df_else_everything = pd.read_csv(f'{folder_save}/{dataset}/{filename}/test_else_save_no_weights_full.csv') + except FileNotFoundError: + df_else_everything = pd.DataFrame([], columns = [v_name for v_name in lists_of_vars.keys()]) + df_else = pd.DataFrame([], columns = [v_name for v_name in lists_of_vars.keys()]) + #print(df_else) + for var in lists_of_vars.keys(): + df_else[var] = pd.Series(lists_of_vars[var]) + df_else_full = pd.concat([df_else_everything, df_else], ignore_index = True) + + df_else.to_csv(f'{folder_save}/{dataset}/{filename}/test_else_save_no_weights.csv', sep=',', encoding='utf-8', index=False) + df_else_full.to_csv(f'{folder_save}/{dataset}/{filename}/test_else_save_no_weights_full.csv', sep=',', encoding='utf-8', index=False) + + return {dataset: output} + + def postprocess(self, accumulator): + #print(accumulator) + return accumulator diff --git a/Zll_process_newHist_pandas_small_update_isolation.py b/Zll_process_newHist_pandas_small_update_isolation.py new file mode 100644 index 0000000..11de771 --- /dev/null +++ b/Zll_process_newHist_pandas_small_update_isolation.py @@ -0,0 +1,1687 @@ +import csv +from curses import meta +from dataclasses import 
dataclass +import gzip +import pickle, os, sys, mplhep as hep, numpy as np +from select import select + +import json + +#from coffea import hist, processor # ToDo: move to the better hist +from coffea import processor # ToDo: move to the better hist +import hist +from hist import Hist +from coffea.nanoevents.methods import vector +import awkward as ak +from VHcc.utils.correction import jec,muSFs,eleSFs,init_corr +from coffea.lumi_tools import LumiMask +from coffea.analysis_tools import Weights, PackedSelection +from functools import partial +# import numba +from VHcc.helpers.util import reduce_and, reduce_or, nano_mask_or, get_ht, normalize, make_p4 +import particle +from hepunits import GeV + +def empty_column_accumulator(): + #return processor.column_accumulator(np.array([],dtype=object)) + return processor.column_accumulator(np.array([],dtype=np.float64)) +def array_accumulator(): + return processor.defaultdict_accumulator(empty_column_accumulator) + +def mT(obj1,obj2): + return np.sqrt(2.*obj1.pt*obj2.pt*(1.-np.cos(obj1.phi-obj2.phi))) +def flatten(ar): # flatten awkward into a 1d array to hist + return ak.flatten(ar, axis=None) +def normalize(val, cut): + if cut is None: + ar = ak.to_numpy(ak.fill_none(val, np.nan)) + return ar + else: + ar = ak.to_numpy(ak.fill_none(val[cut], np.nan)) + return ar + +def read_json(path): + f = open(path) + data = json.load(f) + return data + +def dataset_name_to_number(dataset, year): + samples_path = 'src/VHcc/metadata/sample_info_' + year + '_reversed' + + samples = read_json(samples_path+'.json') + + return samples[dataset]['type'], samples[dataset]['doJetFlavorSplit'] + +def dataset_categories(year): + map_path = 'src/VHcc/metadata/mergemap_' + year + '_Zll' + + samples = read_json(map_path+'.json').values() + all_datasets = [item for sublist in samples for item in sublist] + + return all_datasets + +def get_info_dict(year): + with open(f'src/VHcc/metadata/sample_info_{year}.json') as si: + info = json.load(si) + info_dict={} + for obj in info: + #print(obj) + info_dict[obj]=info[obj]['name'] + return info_dict + +class NanoProcessor(processor.ProcessorABC): + def __init__(self, cfg): + self.cfg = cfg + self._year = self.cfg.dataset["year"] + self._campaign = self.cfg.dataset["campaign"] + + self._version=self.cfg.userconfig['version'] # only because the new runner etc. needs that, not used later + self._export_array = True # if 'test' in self._version else False + self._debug = False #True + + # paths from table 1 and 2 of the AN_2020_235 + + # l l + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L3230-L3337 + self._mumu_hlt = { + '2016': [ + 'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL', + 'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ', + 'Mu17_TrkIsoVVL_TkMu8_TrkIsoVVL', + 'Mu17_TrkIsoVVL_TkMu8_TrkIsoVVL_DZ' + ], + '2017': [ + #'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL', + #'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ', + 'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ_Mass3p8',#allowMissingBranch=1 + 'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ_Mass8'#allowMissingBranch=1 + ], + '2018': [ + #'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL', + #'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ', + 'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ_Mass3p8',#allowMissingBranch=1 but this is the only used one in 2018?! 
+ #'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ_Mass8'#allowMissingBranch=1 + ], + } + + self._ee_hlt = { + '2016': [ + 'Ele23_Ele12_CaloIdL_TrackIdL_IsoVL_DZ' + ], + '2017': [ + 'Ele23_Ele12_CaloIdL_TrackIdL_IsoVL', + #'Ele23_Ele12_CaloIdL_TrackIdL_IsoVL_DZ' # not in VHccAnalysis code + ], + '2018': [ + 'Ele23_Ele12_CaloIdL_TrackIdL_IsoVL' + ], + } + + ''' + # l nu + self._munu_hlt = { + '2016': [ + 'IsoMu24', + 'IsoTkMu24' + ], + '2017': [ + 'IsoMu24', + 'IsoMu27' + ], + '2018': [ + 'IsoMu24', + 'IsoMu27' + ], + } + + self._enu_hlt = { + '2016': [ + 'Ele27_eta2p1_WPTight_Gsf' + ], + '2017': [ + 'Ele32_WPTight_Gsf_L1DoubleEG', + 'Ele32_WPTight_Gsf' + ], + '2018': [ + 'Ele32_WPTight_Gsf_L1DoubleEG', + 'Ele32_WPTight_Gsf'#allowMissingBranch=1 + ], + } + + # nu nu + self._nunu_hlt = { + '2016': [ + 'PFMET110_PFMHT110_IDTight', + #'PFMET110_PFMHT120_IDTight', # found in hltbranches_2016.txt but not in AN, maybe redundant? + 'PFMET170_NoiseCleaned',#allowMissingBranch=1 + 'PFMET170_BeamHaloCleaned',#allowMissingBranch=1 + 'PFMET170_HBHECleaned' + ], + '2017': [ + 'PFMET110_PFMHT110_IDTight', + 'PFMET120_PFMHT120_IDTight', + 'PFMET120_PFMHT120_IDTight_PFHT60',#allowMissingBranch=1 + 'PFMETTypeOne120_PFMHT120_IDTight' + ], + '2018': [ + 'PFMET110_PFMHT110_IDTight', + 'PFMET120_PFMHT120_IDTight', + 'PFMET120_PFMHT120_IDTight_PFHT60'#allowMissingBranch=1 + ], + } + + ''' + + # differences between UL and EOY + # see https://twiki.cern.ch/twiki/bin/view/CMS/MissingETOptionalFiltersRun2 + # also look at sec. 3.7.2 + self._met_filters = { + '2016': { + 'data': [ + 'goodVertices', + 'globalSuperTightHalo2016Filter', + 'HBHENoiseFilter', + 'HBHENoiseIsoFilter', + 'EcalDeadCellTriggerPrimitiveFilter', + 'BadPFMuonFilter', + #'BadPFMuonDzFilter', # not in EOY + 'eeBadScFilter', + ], + 'mc': [ + 'goodVertices', + 'globalSuperTightHalo2016Filter', + 'HBHENoiseFilter', + 'HBHENoiseIsoFilter', + 'EcalDeadCellTriggerPrimitiveFilter', + 'BadPFMuonFilter', + #'BadPFMuonDzFilter', # not in EOY + #'eeBadScFilter', # not suggested in EOY MC + ], + }, + '2017': { + "data": [ + "goodVertices", + "globalSuperTightHalo2016Filter", + "HBHENoiseFilter", + "HBHENoiseIsoFilter", + "EcalDeadCellTriggerPrimitiveFilter", + "BadPFMuonFilter", + "BadPFMuonDzFilter", + "hfNoisyHitsFilter", + "eeBadScFilter", + "ecalBadCalibFilter", + ], + "mc": [ + "goodVertices", + "globalSuperTightHalo2016Filter", + "HBHENoiseFilter", + "HBHENoiseIsoFilter", + "EcalDeadCellTriggerPrimitiveFilter", + "BadPFMuonFilter", + "BadPFMuonDzFilter", + "hfNoisyHitsFilter", + "eeBadScFilter", + "ecalBadCalibFilter", + ], + }, + '2018': { + 'data': [ + 'goodVertices', + 'globalSuperTightHalo2016Filter', + 'HBHENoiseFilter', + 'HBHENoiseIsoFilter', + 'EcalDeadCellTriggerPrimitiveFilter', + 'BadPFMuonFilter', + #'BadPFMuonDzFilter', # not in EOY + #'hfNoisyHitsFilter', # not in EOY + 'eeBadScFilter', + 'ecalBadCalibFilterV2', + ], + 'mc': [ + 'goodVertices', + 'globalSuperTightHalo2016Filter', + 'HBHENoiseFilter', + 'HBHENoiseIsoFilter', + 'EcalDeadCellTriggerPrimitiveFilter', + 'BadPFMuonFilter', + #'BadPFMuonDzFilter', # not in EOY + #'hfNoisyHitsFilter', # not in EOY + #'eeBadScFilter', # not suggested in EOY MC + 'ecalBadCalibFilterV2', + ], + }, + } + + # https://gitlab.cern.ch/aachen-3a/vhcc-nano/-/blob/master/crab/crab_all.py#L33-36 + #'https://cms-service-dqmdc.web.cern.ch/CAF/certification/Collisions16/13TeV/ReReco/Final/Cert_271036-284044_13TeV_23Sep2016ReReco_Collisions16_JSON.txt' + 
#'https://cms-service-dqmdc.web.cern.ch/CAF/certification/Collisions17/13TeV/ReReco/Cert_294927-306462_13TeV_EOY2017ReReco_Collisions17_JSON.txt' + #'https://cms-service-dqmdc.web.cern.ch/CAF/certification/Collisions18/13TeV/ReReco/Cert_314472-325175_13TeV_17SeptEarlyReReco2018ABC_PromptEraD_Collisions18_JSON.txt' + # downloaded. + ''' + self._lumiMasks = { + '2016': LumiMask('src/VHcc/data/Lumimask/Cert_271036-284044_13TeV_23Sep2016ReReco_Collisions16_JSON.txt'), + '2017': LumiMask('src/VHcc/data/Lumimask/Cert_294927-306462_13TeV_EOY2017ReReco_Collisions17_JSON.txt'), + '2018': LumiMask('src/VHcc/data/Lumimask/Cert_314472-325175_13TeV_17SeptEarlyReReco2018ABC_PromptEraD_Collisions18_JSON.txt') + } + ''' + self._lumiMasks = { + '2016': LumiMask('src/VHcc/data/Lumimask/Cert_271036-284044_13TeV_23Sep2016ReReco_Collisions16_JSON.txt'), + '2017': LumiMask('src/VHcc/data/Lumimask/Cert_294927-306462_13TeV_UL2017_Collisions17_GoldenJSON.txt'), + '2018': LumiMask('src/VHcc/data/Lumimask/Cert_314472-325175_13TeV_17SeptEarlyReReco2018ABC_PromptEraD_Collisions18_JSON.txt') + } + + self._corr = init_corr(self._year) + + # Axes: Cat - what it is, a type of something, described with words + # Bin - how much of something, numerical things + # + # --> Some axes are already connected to specific objetcs, or to the event + # --> Others are "building-blocks" that can be reused multiple times + + list_of_datasets = dataset_categories(self._year) + #print(list_of_datasets) + #sys.exit() + # Define axes + # Should read axes from NanoAOD config / metadata + #dataset_axis = hist.Cat("dataset", "Primary dataset") + dataset_axis = hist.axis.StrCategory([], name="dataset", label="Primary dataset", growth=True) + # split V+jets sample & VZ signal, this is per event + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L2184-L2276 + #datasetSplit_axis = hist.Cat("datasetSplit", "Dataset split by flav", list_of_datasets) + datasetSplit_axis = hist.axis.StrCategory(list_of_datasets, name="datasetSplit", label="Dataset split by flav") + + # use hadronFlavour, necessary when applying btag scale factors (that depend on flavour) + # this one will be done per jet, can have values 0, 4, 5 + #flav_axis = hist.Bin("flav", r"hadronFlavour",[0,1,4,5,6]) + flav_axis = hist.axis.Variable([0,1,4,5,6], name="flav", label="hadronFlavour") + + #lepflav_axis = hist.Cat("lepflav",['ee','mumu']) + lepflav_axis = hist.axis.StrCategory(['ee','mumu'], name="lepflav", label="Lepton flav") + + regions = ['SR_2LL','SR_2LH', + 'CR_Zcc_2LL','CR_Zcc_2LH', + 'CR_Z_LF_2LL','CR_Z_LF_2LH', + 'CR_Z_HF_2LL','CR_Z_HF_2LH', + 'CR_t_tbar_2LL','CR_t_tbar_2LH'] + #region_axis = hist.Cat("region",regions) + region_axis = hist.axis.StrCategory(regions, name="region", label="Region") + + # Events + njet_axis = hist.axis.Regular(13, -.5, 12.5, name="nj", label="N jets") #hist.Bin("nj", r"N jets", 13, -.5, 12.5) + + nAddJets_axis = hist.axis.Regular(11, -.5, 10.5, name="nAddJets302p5_puid", label="N additional jets") + #hist.Bin("nAddJets302p5_puid", r"N additional jets", 11, -.5, 10.5) + nAddJets_FSRsub_axis = hist.axis.Regular(11, -.5, 10.5, name="nAddJetsFSRsub302p5_puid", label="N additional jets (FSR subtracted)") + #hist.Bin("nAddJetsFSRsub302p5_puid", r"N additional jets (FSR subtracted)", 11, -.5, 10.5) + + #nbjet_axis = hist.Bin("nbj", r"N b jets", [0,1,2,3,4,5]) + #ncjet_axis = hist.Bin("ncj", r"N c jets", [0,1,2,3,4,5]) + # kinematic variables + pt_axis = hist.axis.Regular(50, 0, 300, name="pt", label=r"$p_{T}$ 
[GeV]") + #hist.Bin("pt", r" $p_{T}$ [GeV]", 50, 0, 300) + eta_axis = hist.axis.Regular(25, -2.5, 2.5, name="eta", label=r"$\eta$") + #hist.Bin("eta", r" $\eta$", 25, -2.5, 2.5) + phi_axis = hist.axis.Regular(30, -3, 3, name="phi", label=r"$\phi$") + #hist.Bin("phi", r" $\phi$", 30, -3, 3) + mass_axis = hist.axis.Regular(50, 0, 300, name="mass", label=r"$m$ [GeV]") + #hist.Bin("mass", r" $m$ [GeV]", 50, 0, 300) + mt_axis = hist.axis.Regular(30, 0, 300, name="mt", label=r"$m_{T}$ [GeV]") + #hist.Bin("mt", r" $m_{T}$ [GeV]", 30, 0, 300) + dr_axis = hist.axis.Regular(20, 0, 5, name="dr", label=r"$\Delta$R") + #hist.Bin("dr","$\Delta$R",20,0,5) + + # some more variables to check, which enter BDT + # need to revisit this later, because high Vpt and low Vpt can have different binning + jjVPtRatio_axis = hist.axis.Regular(15, 0, 2, name="jjVPtRatio", label=r"$p_{T}(jj) / $p_{T}(V)$ [GeV]") + #hist.Bin("jjVPtRatio",r"$p_{T}(jj) / $p_{T}(V)$ [GeV]",15,0,2) + + + #dphi_V_H_axis = hist.Bin("dphi_V_H","$\Delta\Phi(V, H)$",20,0,3.2) + # jet jet + #dr_j1_j2_axis = hist.Bin("dr_j1_j2","$\Delta R(j1,j2)$",20,0,5) + # jet jet + #dphi_j1_j2_axis = hist.Bin("dphi_j1_j2","$\Delta\Phi(j1,j2)$",15,-3.2,3.2) + #deta_j1_j2_axis = hist.Bin("deta_j1_j2","$\Delta\eta(j1,j2)$",15,0,3) + # lepton lepton + #dphi_l1_l2_axis = hist.Bin("dphi_l1_l2","$\Delta\Phi(l1,l2)$",15,0,3.2) + #deta_l1_l2_axis = hist.Bin("eta_l1_l2","$\Delta\eta(l1,l2)$",15,0,2.6) + # jet lepton + #dphi_j1_l1_axis = hist.Bin("dphi_j1_l1","$\Delta\Phi(j1,l1)$",15,0,3.2) + #dphi_j2_l1_axis = hist.Bin("dphi_j2_l1","$\Delta\Phi(j2,l1)$",15,0,3.2) + #dphi_j1_l2_axis = hist.Bin("dphi_j1_l2","$\Delta\Phi(j1,l2)$",15,0,3.2) + #dphi_j2_l2_axis = hist.Bin("dphi_j2_l2","$\Delta\Phi(j2,l2)$",15,0,3.2) + + # ToDo: several other variables can only be stored after kinfit + # e.g. 
here https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/python/kinfitter.py + # or https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L4670-L4995 + + # weights are interesting as well + weight_axis = hist.axis.Regular(100, -5.001, 5.001, name="weight_full", label="weight_full") + #hist.Bin("weight_full","weight_full",100,-5.001, 5.001) + genweight_axis = hist.axis.Regular(100, -0.001, 0.001, name="genWeight", label="genWeight") + #hist.Bin("genWeight","genWeight",100,-0.001, 0.001) + sign_genweight_axis = hist.axis.Regular(100, -1.001, 1.001, name="genWeight_by_abs", label="genWeight/abs(genWeight)") + #hist.Bin("genWeight_by_abs","genWeight/abs(genWeight)",100,-1.001,1.001) + + + # MET vars + #signi_axis = hist.Bin("significance", r"MET $\sigma$",20,0,10) + #covXX_axis = hist.Bin("covXX",r"MET covXX",20,0,10) + #covXY_axis = hist.Bin("covXY",r"MET covXY",20,0,10) + #covYY_axis = hist.Bin("covYY",r"MET covYY",20,0,10) + #sumEt_axis = hist.Bin("sumEt", r" MET sumEt", 50, 0, 300) + + # ToDo: switch to this + # axis.StrCategory([], name='region', growth=True), + #disc_list = [ 'btagDeepCvL', 'btagDeepCvB','btagDeepFlavCvB','btagDeepFlavCvL']#,'particleNetAK4_CvL','particleNetAK4_CvB'] + # As far as I can tell, we only need DeepFlav currently + ### In all of the older stuff use: + #disc_list = ['btagDeepFlavC','btagDeepFlavB','btagDeepFlavCvL','btagDeepFlavCvB'] + ### With new stuff UL I use, use: + disc_list = ['btagDeepFlavCvL','btagDeepFlavCvB'] + btag_axes = [] + for d in disc_list: + # technically, -1 values are possible, but probably unlikely to matter much after event selection + btag_axes.append(hist.axis.Regular(20, 0, 1, name=d, label=d) + #hist.Bin(d, d , 20, 0, 1) + ) + #h = ( + # Hist.new.Reg(10, -5, 5, overflow=False, underflow=False, name="A") + # .Bool(name="B") + # .Var(range(10), name="C") + # .Int(-5, 5, overflow=False, underflow=False, name="D") + # .IntCat(range(10), name="E") + # .StrCat(["T", "F"], name="F") + # .Double() + #) + #print(type(dataset_axis)) + #print(type(lepflav_axis)) + #print(type(flav_axis)) + #print(type(njet_axis)) + #print(type(hist.storage.Weight())) + #testHistA = Hist(dataset_axis, datasetSplit_axis, lepflav_axis, region_axis, njet_axis, hist.storage.Weight()) + #testHist = Hist( + # #dataset_axis, + # #datasetSplit_axis, + # #lepflav_axis, + # #region_axis, + # njet_axis, + # hist.storage.Weight() + # ) + _hist_event_dict = { + 'nj' : Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + njet_axis, hist.storage.Weight()), + 'nAddJets302p5_puid' : Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + nAddJets_axis, hist.storage.Weight()), + 'nAddJetsFSRsub302p5_puid' : Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + nAddJets_FSRsub_axis, hist.storage.Weight()), + # 'weight_full' : Hist(datasetSplit_axis, + # lepflav_axis, + # region_axis, + # weight_axis, hist.storage.Weight()), + # 'genweight' : Hist(datasetSplit_axis, + # lepflav_axis, + # region_axis, + # genweight_axis, hist.storage.Weight()), + # 'sign_genweight' : Hist(datasetSplit_axis, + # lepflav_axis, + # region_axis, + # sign_genweight_axis, hist.storage.Weight()), + 'jjVPtRatio' : Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + jjVPtRatio_axis, hist.storage.Weight()) + + #'dphi_V_H' : Hist( dataset_axis, datasetSplit_axis, lepflav_axis, region_axis, ,dphi_V_H_axis) + #'dr_j1_j2' : Hist( dataset_axis, datasetSplit_axis, lepflav_axis, region_axis, ,dr_j1_j2_axis) + #'dphi_j1_j2' : Hist( dataset_axis, datasetSplit_axis, 
lepflav_axis, region_axis, dphi_j1_j2_axis) + #'deta_j1_j2' : Hist( dataset_axis, datasetSplit_axis, lepflav_axis, region_axis, deta_j1_j2_axis) + #'dphi_l1_l2' : Hist( dataset_axis, datasetSplit_axis, lepflav_axis, region_axis, dphi_l1_l2_axis) + #'dphi_j1_l2' : Hist( dataset_axis, datasetSplit_axis, lepflav_axis, region_axis, dphi_j1_l2_axis) + #'dphi_j2_l2' : Hist( dataset_axis, datasetSplit_axis, lepflav_axis, region_axis, dphi_j2_l2_axis) + + #'sampleFlavSplit' : Hist( dataset_axis, lepflav_axis, region_axis, sampleFlavSplit_axis), + #'nbj' : Hist( dataset_axis, lepflav_axis, region_axis, nbjet_axis), + #'ncj' : Hist( dataset_axis, lepflav_axis, region_axis, ncjet_axis), + #'hj_dr' : Hist( dataset_axis, lepflav_axis, region_axis, dr_axis), + #'MET_sumEt' : Hist( dataset_axis, lepflav_axis, region_axis, sumEt_axis), + #'MET_significance' : Hist( dataset_axis, lepflav_axis, region_axis, signi_axis), + #'MET_covXX' : Hist( dataset_axis, lepflav_axis, region_axis, covXX_axis), + #'MET_covXY' : Hist( dataset_axis, lepflav_axis, region_axis, covXY_axis), + #'MET_covYY' : Hist( dataset_axis, lepflav_axis, region_axis, covYY_axis), + #'MET_phi' : Hist( dataset_axis, lepflav_axis, region_axis, phi_axis), + #'MET_pt' : Hist( dataset_axis, lepflav_axis, region_axis, pt_axis), + #'mT1' : Hist( dataset_axis, lepflav_axis, region_axis, mt_axis), + #'mT2' : Hist( dataset_axis, lepflav_axis, region_axis, mt_axis), + #'mTh':Hist( dataset_axis, lepflav_axis, region_axis, mt_axis), + #'dphi_lep1':Hist( dataset_axis, lepflav_axis, region_axis, phi_axis), + #'dphi_lep2':Hist( dataset_axis, lepflav_axis, region_axis, phi_axis), + #'dphi_ll':Hist( dataset_axis, lepflav_axis, region_axis, phi_axis), + } + + # jets will be ordered by DeepJet (which is DeepFlav for historical reasons) + objects=['leading_jetflav','subleading_jetflav','lep1','lep2','ll','jj'] + + for i in objects: + # distinguish between jets and other objects, as the structure for jets contains additional flavour axis + if 'jet' in i: + _hist_event_dict["%s_pt" %(i)]=Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + flav_axis, + pt_axis, hist.storage.Weight()) + _hist_event_dict["%s_eta" %(i)]=Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + flav_axis, + eta_axis, hist.storage.Weight()) + _hist_event_dict["%s_phi" %(i)]=Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + flav_axis, + phi_axis, hist.storage.Weight()) + _hist_event_dict["%s_mass" %(i)]=Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + flav_axis, + mass_axis, hist.storage.Weight()) + else: + _hist_event_dict["%s_pt" %(i)]=Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + pt_axis, hist.storage.Weight()) + _hist_event_dict["%s_eta" %(i)]=Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + eta_axis, hist.storage.Weight()) + _hist_event_dict["%s_phi" %(i)]=Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + phi_axis, hist.storage.Weight()) + _hist_event_dict["%s_mass" %(i)]=Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + mass_axis, hist.storage.Weight()) + + # more information on the discriminators is stored for the first two jets, + # ordered by DeepJet CvL discriminator and called "leading" and "subleading" + for disc, axis in zip(disc_list,btag_axes): + _hist_event_dict["leading_jetflav_%s" %(disc)] = Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + flav_axis, + axis, hist.storage.Weight()) + _hist_event_dict["subleading_jetflav_%s" %(disc)] = Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + flav_axis, + axis, 
hist.storage.Weight())
+
+        self.event_hists = list(_hist_event_dict.keys())
+
+        # this can be used to not only store histograms, but also features on a per-event basis (arrays)
+        if self._export_array:
+            _hist_event_dict['array'] = processor.defaultdict_accumulator(array_accumulator)
+        #self._accumulator = processor.dict_accumulator(
+        #    {**_hist_event_dict,
+        #     #'cutflow': processor.defaultdict_accumulator(
+        #     #    partial(processor.defaultdict_accumulator, int))
+        #    })
+        #self._accumulator['sumw'] = processor.defaultdict_accumulator(float)
+
+        #self._accumulator = processor.dict_accumulator(
+        #    {
+        #        observable: Hist.Hist(var_axis, name="Counts", storage="Weight")
+        #        for observable, var_axis in axis.items()
+        #        if observable != "dataset"
+        #    }
+        #)
+        #self._accumulator["cutflow"] = processor.defaultdict_accumulator(
+        #    partial(processor.defaultdict_accumulator, int)
+        #)
+        #self._accumulator["sumw"] = 0
+
+        self.make_output = lambda: {
+            "cutflow": processor.defaultdict_accumulator(
+                partial(processor.defaultdict_accumulator, int)
+            ),
+            "sumw": 0,
+            **_hist_event_dict
+        }
+
+
+    @property
+    def accumulator(self):
+        # self._accumulator is never assigned anymore (all assignments above are
+        # commented out), so return a fresh output template instead of a stale attribute
+        return self.make_output()
+
+    def process(self, events):
+        #output = self.accumulator #.identity()
+        output = self.make_output()
+        dataset = events.metadata['dataset']
+        start = events.metadata['entrystart']
+        stop = events.metadata['entrystop']
+        output_location_list = []
+        # note: str.strip('.root') would remove any of the characters '.', 'r', 'o', 't'
+        # from both ends of the name, not the suffix; slice the extension off instead
+        filename = events.metadata['filename'].split('/')[-1]
+        if filename.endswith('.root'):
+            filename = filename[:-len('.root')]
+        #print(dataset)
+        # Q: could there be MC that does not have this attribute? Or is it always the case?
+        isRealData = not hasattr(events, "genWeight")
+
+        # Done (externally): map from the lengthy dataset (path) to a more readable name
+        # Keep the long name only for data, because it contains the Run info (necessary to apply corrections)
+        if isRealData:
+            info_dict = get_info_dict(self._year)
+            dataset_long = dataset
+            dictname = dataset[1:].split('/')[0]
+            dataset = info_dict[dictname]
+            print(dataset)
+        sample_type, doFlavSplit = dataset_name_to_number(dataset, self._year)
+        # the length of events is used many times later on, so save it once and refer to that
+        nEvents = len(events)
+        print('Number of events: ', nEvents)
+        if 'ZH' in dataset:
+            ttyp = 'signal_04_mid'
+        else:
+            ttyp = 'back_04_mid'
+        folder_save = f'condor_{ttyp}'
+        # makedirs creates the full path in one call; exist_ok=True avoids a race
+        # when several workers create the same directories in parallel
+        os.makedirs(f"./{folder_save}/{dataset}/{filename}", exist_ok=True)
+        with open(f"./{folder_save}/event_nr.txt", "a") as myfile:
+            myfile.write(f"Nr of events in {filename} from {start} to {stop}: " + str(nEvents) + " " + '\n')
+
+        # As far as I understand, this is a neat way to give selections a name,
+        # while internally there are boolean arrays for all events
+        selection = PackedSelection()
+
+
+        # this is either counting events in data with weight 1, or weighted (MC)
+        if isRealData:
+            output['sumw'] += nEvents
+        else:
+            # instead of taking the weights themselves, the sign is used:
+            # https://cms-talk.web.cern.ch/t/huge-event-weights-in-dy-powhegminnlo/8718/7
+            # although I initially had the same concerns as those raised in the thread,
+            # if not only the sign is different, but also the absolute values between events,
+            # somehow it seems to average out, although I don't see why this is guaranteed;
+            # must have to do with "LO without interference" where
the values are indeed same + # and if they are not same, the differences are consired to be negligible + output['sumw'] += ak.sum(events.genWeight/abs(events.genWeight)) + + + req_lumi=np.ones(nEvents, dtype='bool') + if isRealData: + req_lumi=self._lumiMasks[self._year](events.run, events.luminosityBlock) + selection.add('lumi',ak.to_numpy(req_lumi)) + del req_lumi + + + # AS: sort of the same thing as above, but now per entry + weights = Weights(nEvents, storeIndividual=True) + if isRealData: + weights.add('genweight',np.ones(nEvents)) + else: + weights.add('genweight',events.genWeight/abs(events.genWeight)) + # weights.add('puweight', compiled['2017_pileupweight'](events.Pileup.nPU)) + + + ############## + if isRealData: + output['cutflow'][dataset]['all'] += nEvents + output['cutflow'][dataset]['all (weight 1)'] += nEvents + else: + output['cutflow'][dataset]['all'] += ak.sum(events.genWeight/abs(events.genWeight)) + output['cutflow'][dataset]['all (weight 1)'] += nEvents + + + #trigger_met = np.zeros(nEvents, dtype='bool') + + trigger_ee = np.zeros(nEvents, dtype='bool') + trigger_mm = np.zeros(nEvents, dtype='bool') + + #trigger_e = np.zeros(nEvents, dtype='bool') + #trigger_m = np.zeros(nEvents, dtype='bool') + + #for t in self._nunu_hlt[self._year]: + # # so that already seems to be the check for whether the path exists in the file or not + # if t in events.HLT.fields: + # trigger_met = trigger_met | events.HLT[t] + + for t in self._mumu_hlt[self._year]: + if t in events.HLT.fields: + trigger_mm = trigger_mm | events.HLT[t] + + for t in self._ee_hlt[self._year]: + if t in events.HLT.fields: + trigger_ee = trigger_ee | events.HLT[t] + + #for t in self._munu_hlt[self._year]: + # if t in events.HLT.fields: + # trigger_m = trigger_m | events.HLT[t] + + #for t in self._emu_hlt[self._year]: + # if t in events.HLT.fields: + # trigger_e = trigger_e | events.HLT[t] + + + selection.add('trigger_ee', ak.to_numpy(trigger_ee)) + selection.add('trigger_mumu', ak.to_numpy(trigger_mm)) + + + # apart from the comments above about EOY/UL, should be fine + metfilter = np.ones(nEvents, dtype='bool') + for flag in self._met_filters[self._year]['data' if isRealData else 'mc']: + metfilter &= np.array(events.Flag[flag]) + selection.add('metfilter', metfilter) + del metfilter + + + + # Not strictly necessary for Zll + met = ak.zip({ + "pt": events.MET.pt, + "phi": events.MET.phi, + "energy": events.MET.sumEt, + }, with_name="PtEtaPhiMLorentzVector" + ) + + + + split_by_flav = False + sampleFlavSplit = np.zeros(nEvents) + possible_flavSplits = ['already_split_sample'] + selection.add('already_split_sample',sampleFlavSplit == 0) + if not isRealData and not self._debug: + if doFlavSplit == '1' and not (int(sample_type) >= 27 and int(sample_type) <= 39): + split_by_flav = True + # uses the same naming scheme as AT, although udbsg is counterintuitive (b? 
[sic!]) + possible_flavSplits = ['_cc','_bb','_bc','_cl','_bl','_udbsg'] + # ================================================================================= + # + # # Split V+jets BG by flavour, via GenJet + # + # --------------------------------------------------------------------------------- + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L2184-L2228 + gen_jet = events.GenJet + + cGenJetTot = ak.sum((gen_jet.hadronFlavour == 4) & (gen_jet.pt > 20) & (abs(gen_jet.eta) < 2.4), axis=1) + bGenJetTot = ak.sum((gen_jet.hadronFlavour == 5) & (gen_jet.pt > 20) & (abs(gen_jet.eta) < 2.4), axis=1) + + tag_cc = cGenJetTot >= 2 + tag_bb = bGenJetTot >= 2 + tag_bc = (bGenJetTot == 1) & (cGenJetTot == 1) + tag_cl = (cGenJetTot == 1) & (bGenJetTot == 0) + tag_bl = (bGenJetTot == 1) & (cGenJetTot == 0) + tag_ll = (cGenJetTot == 0) & (bGenJetTot == 0) + + sampleFlavSplit = 1 * tag_cc + 2 * tag_bb + 3 * tag_bc + 4 * tag_cl + 5 * tag_bl + 6 * tag_ll + selection.add('_cc',sampleFlavSplit == 1) + selection.add('_bb',sampleFlavSplit == 2) + selection.add('_bc',sampleFlavSplit == 3) + selection.add('_cl',sampleFlavSplit == 4) + selection.add('_bl',sampleFlavSplit == 5) + selection.add('_udbsg',sampleFlavSplit == 6) # tbf I don't know why it contains b + + #elif dataset in ['WZTo1L1Nu2Q', 'ZZTo2L2Q', 'ZZTo2Q2Nu']: # VZ signal datasets + elif int(sample_type) in [32,36,37]: # VZ signal datasets + split_by_flav = True + possible_flavSplits = ['cc','bb','ll'] + # ================================================================================= + # + # # Split VZ signal by flavour, via GenPart + # + # --------------------------------------------------------------------------------- + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L2229-L2264 + gen_part = events.GenPart + + + Z_decay_mothers_A = (abs(gen_part.pdgId) == 23) & (gen_part.hasFlags('isLastCopy')) + + Z_decays = gen_part[Z_decay_mothers_A] + output['cutflow'][dataset]['GenPart VZ signal'] += ak.sum(Z_decay_mothers_A) + + n_b_from_Z = ak.sum(ak.sum(abs(Z_decays.children.pdgId) == 5, axis=-1), axis=-1) + n_c_from_Z = ak.sum(ak.sum(abs(Z_decays.children.pdgId) == 4, axis=-1), axis=-1) + + + + VZ_cc = (n_c_from_Z >= 2) + VZ_bb = (n_b_from_Z >= 2) + VZ_others = (~VZ_cc) & (~VZ_bb) + # 1, 2 and 3 identical to what was done in AnalysisTools! Do not confuse with BTV / hadron / parton flavour... 
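+            # The weighted boolean sum below encodes the categories as integers: in
+            # these samples at most one Z decays hadronically, so at most one of
+            # VZ_cc / VZ_bb can fire and each event gets exactly one code, e.g.
+            #   VZ_cc=True, VZ_bb=False, VZ_others=False  ->  1*1 + 2*0 + 3*0 = 1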
+ sampleFlavSplit = 1 * VZ_cc + 2 * VZ_bb + 3 * VZ_others + + #print(sampleFlavSplit.type) + + selection.add('cc',sampleFlavSplit == 1) + selection.add('bb',sampleFlavSplit == 2) + selection.add('ll',sampleFlavSplit == 3) + + elif int(sample_type) in [27,28,29,30,31,33,34,35,38,39]: + possible_flavSplits = ['ll'] + sampleFlavSplit = sampleFlavSplit + 3 + selection.add('ll',sampleFlavSplit == 3) + split_by_flav = True + + # this is how it looked in AT for comparison: + ''' + else if( cursample->doJetFlavorSplit + && ( mInt("sampleIndex")==27 || mInt("sampleIndex")==28 + || mInt("sampleIndex")==29 || mInt("sampleIndex")==30 + || mInt("sampleIndex")==31 || mInt("sampleIndex")==33 + || mInt("sampleIndex")==34 || mInt("sampleIndex")==35 + || mInt("sampleIndex")==38 || mInt("sampleIndex")==39 + ) + ){ + *in["sampleIndex"] = mInt("sampleIndex")*100 + 3; + ''' + + + + + + # ================================================================================= + # + # # Reconstruct and preselect leptons + # + # --------------------------------------------------------------------------------- + + + # Adopt from https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L3369-L3440 + # https://gitlab.cern.ch/aachen-3a/vhcc-nano/-/blob/master/VHccProducer.py#L345-389 + + # ## Muon cuts + ## muon twiki: https://twiki.cern.ch/twiki/bin/view/CMS/SWGuideMuonIdRun2 + #event_mu = events.Muon[ak.argsort(events.Muon.pt, axis=1, ascending=False)] + event_mu = events.Muon + # looseId >= 1 or looseId seems to be the same... + musel = ((event_mu.pt > 20) & (abs(event_mu.eta) < 2.4) & (event_mu.looseId >= 1) & (event_mu.pfRelIso04_all<0.25)) #(event_mu.looseId >= 1) (event_mu.mvaId >= 3) + # but 25GeV and 0.06 for 1L, xy 0.05 z 0.2, &(abs(event_mu.dxy)<0.06)&(abs(event_mu.dz)<0.2) and tightId for 1L + event_mu = event_mu[musel] + event_mu = event_mu[ak.argsort(event_mu.pt, axis=1, ascending=False)] + event_mu["lep_flav"] = 13*event_mu.charge + event_mu= ak.pad_none(event_mu,2,axis=1) + nmu = ak.sum(musel,axis=1) + # ToDo: PtCorrGeoFit + + # ## Electron cuts + ## # electron twiki: https://twiki.cern.ch/twiki/bin/viewauth/CMS/CutBasedElectronIdentificationRun2 + #event_e = events.Electron[ak.argsort(events.Electron.pt, axis=1,ascending=False)] + event_e = events.Electron + elesel = ((event_e.pt > 20) & (abs(event_e.eta) < 2.5) & (event_e.mvaFall17V2Iso_WP90==1) & (event_e.pfRelIso03_all<0.25)) + # but 30GeV and WP80 for 1L + event_e = event_e[elesel] + # something I saw in a recent presentation, and also in AT code: + # https://indico.desy.de/event/34473/contributions/122201/attachments/76587/98753/RTG_Meeting_01_09_22.pdf + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/VHccAnalysis/PlotWithVarial/ZllHccLowPt.py#L256-L257 + # is to require "good electrons", which means excluding some region (eta), + # I guess it has sth to do with transition between barrel / endcap? 
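+        # (The window 1.4442 < |eta| < 1.5660 excluded below is the ECAL barrel-endcap
+        # transition ("crack") region, where electron reconstruction is degraded.)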
+ event_e = event_e[(abs(event_e.eta) > 1.5660) | (abs(event_e.eta) < 1.4442)] + event_e = event_e[ak.argsort(event_e.pt, axis=1,ascending=False)] + event_e["lep_flav"] = 11*event_e.charge + event_e = ak.pad_none(event_e,2,axis=1) + nele = ak.sum(elesel,axis=1) + # sorting after selecting should be faster (less computations on average) + + # for this channel (Zll / 2L) + selection.add('lepsel',ak.to_numpy((nele==2)|(nmu==2))) + + print(event_e) + print("Elecs") + print(event_mu) + print("Mus") + print(ak.concatenate([ event_e, event_mu], axis=1)) + print("Concat") + #### build lepton pair(s) + good_leptons = ak.with_name( + ak.concatenate([ event_e, event_mu], axis=1), + "PtEtaPhiMCandidate", ) + good_leptons = good_leptons[ak.argsort(good_leptons.pt, axis=1,ascending=False)] + leppair = ak.combinations( + good_leptons, + n=2, + replacement=False, + axis=-1, + fields=["lep1", "lep2"], + ) + #charged_constr = ((leppair.lep1['lep_flav'] + leppair.lep2['lep_flav']) == 0 ) + #leppair = leppair[charged_constr] + ll_cand = ak.zip({ + "lep1" : leppair.lep1, + "lep2" : leppair.lep2, + "pt": (leppair.lep1+leppair.lep2).pt, + "eta": (leppair.lep1+leppair.lep2).eta, + "phi": (leppair.lep1+leppair.lep2).phi, + "mass": (leppair.lep1+leppair.lep2).mass, + }, with_name="PtEtaPhiMLorentzVector" + ) + # probably there needs to be a cross-check that we don't include more than we want here, + # I know there is the option to truncate the array if more than 1 is found + # --> clip = True + ll_cand = ak.pad_none(ll_cand,1,axis=1) + + print(ll_cand) + + # there seem to be multiple ways to get the "one" ll_cand of interest + # - closest to Z-mass [makes sense] + # I think others use this + # - lepton-pair with highest pt [also, maybe it's even the same in the majority of the cases] + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L3369-L3440 + ZMASS = particle.Particle.findall("Z0")[0].mass / GeV + + if (ak.count(ll_cand.pt)>0): + ll_cand = ll_cand[ak.argsort(ll_cand.pt, axis=1,ascending=False)] + #if (ak.count(ll_cand.pt)>0): + # ll_cand = ll_cand[ak.argsort(np.abs(ll_cand.mass-ZMASS), axis=1,ascending=True)] + # try the second option here + # NOTE: Comment out to debug stuff + ll_cand = ll_cand[:, 0] + + print(ll_cand) + print() + + # ================================================================================= + # + # # Reconstruct and preselect leptons gen level + # + # --------------------------------------------------------------------------------- + + # ## Muon cuts + generator = events.GenPart + #event_mu = events.Muon[ak.argsort(events.Muon.pt, axis=1, ascending=False)] + + event_mu_gen = generator[np.abs(generator.pdgId) == 13] + # looseId >= 1 or looseId seems to be the same... 
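+        # (GenPart.status == 1 selects stable final-state particles; no ID or
+        # isolation requirement exists at gen level, hence only pt/eta/status cuts.)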
+ musel_gen = ((event_mu_gen.pt > 20) & (abs(event_mu_gen.eta) < 2.4) & (event_mu_gen.status == 1)) + event_mu_gen = event_mu_gen[musel_gen] + event_mu_gen = event_mu_gen[ak.argsort(event_mu_gen.pt, axis=1, ascending=False)] + event_mu_gen["lep_flav"] = event_mu_gen.pdgId + event_mu_gen["charge"] = event_mu_gen.pdgId/13 + event_mu_gen= ak.pad_none(event_mu_gen,2,axis=1) + nmu_gen = ak.sum(musel_gen,axis=1) + # ToDo: PtCorrGeoFit + + # ## Electron cuts + ## # electron twiki: https://twiki.cern.ch/twiki/bin/viewauth/CMS/CutBasedElectronIdentificationRun2 + #event_e = events.Electron[ak.argsort(events.Electron.pt, axis=1,ascending=False)] + event_e_gen = generator[np.abs(generator.pdgId) == 11] + elesel_gen = ((event_e_gen.pt > 20) & (abs(event_e_gen.eta) < 2.4) & (event_e_gen.status == 1)) + # but 30GeV and WP80 for 1L + event_e_gen = event_e_gen[elesel_gen] + event_e_gen = event_e_gen[(abs(event_e_gen.eta) > 1.5660) | (abs(event_e_gen.eta) < 1.4442)] + event_e_gen = event_e_gen[ak.argsort(event_e_gen.pt, axis=1,ascending=False)] + event_e_gen["lep_flav"] = event_e_gen.pdgId + event_e_gen["charge"] = event_e_gen.pdgId/11 + event_e_gen = ak.pad_none(event_e_gen,2,axis=1) + nele_gen = ak.sum(elesel_gen,axis=1) + # sorting after selecting should be faster (less computations on average) + + # for this channel (Zll / 2L) + selection.add('lepsel_gen',ak.to_numpy((nele_gen==2)|(nmu_gen==2))) + + print(event_e_gen) + print("Elecs") + print(event_mu_gen) + print("Mus") + print(ak.concatenate([ event_e_gen, event_mu_gen], axis=1)) + print("Concat") + #### build lepton pair(s) + good_leptons_gen = ak.with_name( + ak.concatenate([ event_e_gen, event_mu_gen], axis=1), + "PtEtaPhiMCandidate", ) + good_leptons_gen = good_leptons_gen[ak.argsort(good_leptons_gen.pt, axis=1,ascending=False)] + leppair_gen = ak.combinations( + good_leptons_gen, + n=2, + replacement=False, + axis=1, + fields=["lep1", "lep2"], + ) + #charged_constr = ((leppair.lep1['lep_flav'] + leppair.lep2['lep_flav']) == 0 ) + #leppair = leppair[charged_constr] + ll_cand_gen = ak.zip({ + "lep1" : leppair_gen.lep1, + "lep2" : leppair_gen.lep2, + "pt": (leppair_gen.lep1+leppair_gen.lep2).pt, + "eta": (leppair_gen.lep1+leppair_gen.lep2).eta, + "phi": (leppair_gen.lep1+leppair_gen.lep2).phi, + "mass": (leppair_gen.lep1+leppair_gen.lep2).mass, + }, with_name="PtEtaPhiMLorentzVector" + ) + # probably there needs to be a cross-check that we don't include more than we want here, + # I know there is the option to truncate the array if more than 1 is found + # --> clip = True + ll_cand_gen = ak.pad_none(ll_cand_gen,1,axis=1) + + print(ll_cand_gen) + + # there seem to be multiple ways to get the "one" ll_cand of interest + # - closest to Z-mass [makes sense] + # I think others use this + # - lepton-pair with highest pt [also, maybe it's even the same in the majority of the cases] + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L3369-L3440 + ZMASS = particle.Particle.findall("Z0")[0].mass / GeV + + if (ak.count(ll_cand_gen.pt)>0): + ll_cand_gen = ll_cand_gen[ak.argsort(ll_cand_gen.pt, axis=1,ascending=False)] + #if (ak.count(ll_cand.pt)>0): + # ll_cand = ll_cand[ak.argsort(np.abs(ll_cand.mass-ZMASS), axis=1,ascending=True)] + # try the second option here + # NOTE: Comment out to debug stuff + ll_cand_gen = ll_cand_gen[:, 0] + + print(ll_cand_gen) + print() + + # ================================================================================= + # + # # Reconstruct and preselect jets + # + # 
--------------------------------------------------------------------------------- + + # Apply correction: + if isRealData: + #print(dataset_long) + jets = jec(events,events.Jet,dataset_long,self._year,self._corr) + else: + jets = jec(events,events.Jet,dataset,self._year,self._corr) + #jets = events.Jet + + # This was necessary for the FSR code + #jets = jets.mask[ak.num(jets) > 2] + + + + # For EOY: recalculate CvL & CvB here, because the branch does not exist in older files + # adapted from PostProcessor + def deepflavcvsltag(jet): + btagDeepFlavL = 1.-(jet.btagDeepFlavC+jet.btagDeepFlavB) + return ak.where((jet.btagDeepFlavB >= 0.) & (jet.btagDeepFlavB < 1.) & (jet.btagDeepFlavC >= 0.) & (btagDeepFlavL >= 0.), + jet.btagDeepFlavC/(1.-jet.btagDeepFlavB), + (-1.) * ak.ones_like(jet.btagDeepFlavB)) + + def deepflavcvsbtag(jet): + btagDeepFlavL = 1.-(jet.btagDeepFlavC+jet.btagDeepFlavB) + return ak.where((jet.btagDeepFlavB > 0.) & (jet.btagDeepFlavC > 0.) & (btagDeepFlavL >= 0.), + jet.btagDeepFlavC/(jet.btagDeepFlavC+jet.btagDeepFlavB), + (-1.) * ak.ones_like(jet.btagDeepFlavB)) + + # Alternative ways: + # - depending on the Nano version, there might already be bTagDeepFlavCvL available + # - one could instead use DeepCSV via bTagDeepCvL + # - not necessarily use CvL, other combination possible ( CvB | pt | BDT? ) + + #jets["btagDeepFlavCvL"] = deepflavcvsltag(jets) + #jets["btagDeepFlavCvB"] = deepflavcvsbtag(jets) + jets = jets[ak.argsort(jets.btagDeepFlavCvL, axis=1, ascending=False)] + + + # Jets are considered only if the following identification conditions hold, as mentioned in AN + # - Here is some documentation related to puId and jetId: + # https://twiki.cern.ch/twiki/bin/viewauth/CMS/PileupJetID + # https://twiki.cern.ch/twiki/bin/viewauth/CMS/JetID + jet_conditions = (((abs(jets.eta) < 2.4) & (jets.pt > 20) & (jets.puId > 0)) \ + | ((jets.pt>50) & (jets.jetId>5))) & ak.all(jets.metric_table(ll_cand.lep1)>0.4, axis =2) & ak.all(jets.metric_table(ll_cand.lep2)>0.4, axis = 2) + # Count how many jets exist that pass this selection + njet = ak.sum(jet_conditions,axis=1) + selection.add('jetsel',ak.to_numpy(njet>=2)) + + + # ================================================================================= + # + # # FSR recovery + # + # --------------------------------------------------------------------------------- + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L841-L956 + + # FSR jets are selected with slightly different criteria + fsr_conditions = (abs(jets.eta) < 3) & (jets.pt > 20) \ + & ak.all(jets.metric_table(ll_cand.lep1)>0.4, axis =2) & ak.all(jets.metric_table(ll_cand.lep2)>0.4, axis = 2) + # Take the first two jets that pass the criteria and check the remaining ones, + # as well as potentially others, to get FSR jets: + pick2 = jets[ak.pad_none(ak.local_index(jets, 1)[jet_conditions], 2)[:, :2]] + others = jets[ak.concatenate([ak.pad_none(ak.local_index(jets, 1)[(jet_conditions) & (fsr_conditions)], 2)[:, 2:], + ak.local_index(jets, 1)[(~jet_conditions) & (fsr_conditions)] + ], axis=1)] + + + def find_fsr(leading, subleading, others, threshold=0.8): + mval1, (a1, b) = leading.metric_table(others, return_combinations=True) + mval2, (a2, b) = subleading.metric_table(others, return_combinations=True) + + def res(mval, out): + order = ak.argsort(mval, axis=-1) + return out[order], mval[order] + + out1, metric1 = res(mval1, b) + out2, metric2 = res(mval2, b) + + out1 = out1.mask[(metric1 <= threshold) & (metric1 < metric2)] + out2 = 
out2.mask[(metric2 <= threshold) & (metric2 < metric1)] + #out2 = out2.mask[(metric1 <= threshold) & (metric2 < metric1)] + return out1[:, 0, ...], out2[:, 0, ...] + + + missing = ~(ak.is_none(pick2[:, 0]) | ak.is_none(pick2[:, 1])) + pick2 = pick2.mask[missing] + others = others.mask[missing] + + + leading, subleading = pick2[:, 0], pick2[:, 1] + fsr_leading, fsr_subleading = find_fsr(leading, subleading, others, threshold=0.8) + + #print(leading.pt) + #print((leading + fsr_leading.sum()).pt) + + # To explicitly check that adding FSR does indeed have an effect + #print(ak.sum((leading + fsr_leading.sum()).pt != leading.pt)) + + #print(leading.type) + + # Collect the (sub-)leading jets and their respective FSR jets in a new 4-vector + leading_with_fsr = ak.zip({ + "jet1" : leading, + "jet2" : fsr_leading.sum(), + "pt": (leading + fsr_leading.sum()).pt, + "eta": (leading + fsr_leading.sum()).eta, + "phi": (leading + fsr_leading.sum()).phi, + "mass": (leading + fsr_leading.sum()).mass, + },with_name="PtEtaPhiMLorentzVector",) + + subleading_with_fsr = ak.zip({ + "jet1" : subleading, + "jet2" : fsr_subleading.sum(), + "pt": (subleading + fsr_subleading.sum()).pt, + "eta": (subleading + fsr_subleading.sum()).eta, + "phi": (subleading + fsr_subleading.sum()).phi, + "mass": (subleading + fsr_subleading.sum()).mass, + },with_name="PtEtaPhiMLorentzVector",) + + + # (Maybe) one could calculate the angle between FSR & the "main" jet they correspond to + # - this would be correlated with the mass of the decaying p. via the dead-cone effect, + # - could be a discriminating variable at the event level. + + # ================================================================================= + # + # # Build Higgs candidate w/ or w/o FSR + # + # --------------------------------------------------------------------------------- + + # Build 4-vector from leading + subleading jets, with or without FSR + higgs_cand_no_fsr = ak.zip({ + "jet1" : leading, + "jet2" : subleading, + "pt": (leading + subleading).pt, + "eta": (leading + subleading).eta, + "phi": (leading + subleading).phi, + "mass": (leading + subleading).mass, + },with_name="PtEtaPhiMLorentzVector",) + + higgs_cand = ak.zip({ + "jet1" : leading_with_fsr, + "jet2" : subleading_with_fsr, + "pt": (leading_with_fsr + subleading_with_fsr).pt, + "eta": (leading_with_fsr + subleading_with_fsr).eta, + "phi": (leading_with_fsr + subleading_with_fsr).phi, + "mass": (leading_with_fsr + subleading_with_fsr).mass, + },with_name="PtEtaPhiMLorentzVector",) + + + + # ================================================================================= + # + # # Actual event selection starts here + # + # --------------------------------------------------------------------------------- + + + # Common global requirements in the Zll channel + # - valid for 2LH and 2LL + # - valid for any region, no matter if SR or CR + + # leppair and ll_cand have different dim, leppair contains lists, + # ll_cand only numbers on innermost dim (because already reduced above) + # therefore when evaluating ak.any with axis=-1, + # ll_cand will ALWAYS be true (a.k.a. 
for every event), as long as one event fulfils the criterion + # for leppair, there needs to be one per event, as expected + # print((leppair.lep1.pt>20)) + # print((ll_cand.mass>75)) + # print((higgs_cand.mass<250)) + # print((njet>=2)) + # inside any one can then only place stuff that has one more dim + + # related to individual leptons + req_global = ak.any((leppair.lep1.pt>20) & (leppair.lep2.pt>20) \ + # opposite charge + & ((leppair.lep1.charge+leppair.lep2.charge)==0) \ + , axis=-1 + ) + # cands and global stuff + # note: V_pt > 60 as in AT, AN: 50 (don't confuse) + req_global = req_global \ + & (ll_cand.pt>60) \ + & (njet>=2) \ + & (higgs_cand.mass<250) + + + selection.add('global_selection',ak.to_numpy(req_global)) + + + mask2e = req_global & (nele == 2) + mask2mu = req_global & (nmu == 2) + + #mask2lep = [ak.any(tup) for tup in zip(maskemu, mask2mu, mask2e)] + mask2lep = [ak.any(tup) for tup in zip(mask2mu, mask2e)] + + good_leptons = ak.mask(good_leptons,mask2lep) + + + #output['cutflow'][dataset]['selected Z pairs'] += ak.sum(ak.num(good_leptons)>0) + + selection.add('ee',ak.to_numpy(nele == 2)) + selection.add('mumu',ak.to_numpy(nmu == 2)) + + + #print(higgs_cand.type) + #print(ll_cand.type) + + # global already contains Vpt>60 as the lower bound + # global also has higgs_cand.mass<250 + req_sr_Zll = (ll_cand.mass > 75) & (ll_cand.mass < 105) \ + & (higgs_cand.delta_phi(ll_cand)>2.5) \ + & (higgs_cand.mass>=50) & (higgs_cand.mass<=200) \ + & (leading.btagDeepFlavCvL>0.225) & (leading.btagDeepFlavCvB>0.4) + # flip H mass, otherwise same + req_cr_Zcc = (ll_cand.mass > 85) & (ll_cand.mass < 97) \ + & (higgs_cand.delta_phi(ll_cand)>2.5) \ + & ~((higgs_cand.mass>=50) & (higgs_cand.mass<=200)) \ + & (leading.btagDeepFlavCvL>0.225) & (leading.btagDeepFlavCvB>0.4) + # Note: m_ll requirement not in AN, but in AT + req_cr_Z_LF = (ll_cand.mass > 75) & (ll_cand.mass < 105) \ + & (higgs_cand.delta_phi(ll_cand)>2.5) \ + & (higgs_cand.mass>=50) & (higgs_cand.mass<=200) \ + & (leading.btagDeepFlavCvL<0.225) & (leading.btagDeepFlavCvB>0.4) + + req_cr_Z_HF = (ll_cand.mass > 85) & (ll_cand.mass < 97) \ + & (higgs_cand.delta_phi(ll_cand)>2.5) \ + & (higgs_cand.mass>=50) & (higgs_cand.mass<=200) \ + & (leading.btagDeepFlavCvL>0.225) & (leading.btagDeepFlavCvB<0.4) + + req_cr_t_tbar = ~((ll_cand.mass>0) & (ll_cand.mass<10)) & ~((ll_cand.mass>75) & (ll_cand.mass<120)) \ + & (higgs_cand.mass>=50) & (higgs_cand.mass<=200) \ + & (leading.btagDeepFlavCvL>0.225) & (leading.btagDeepFlavCvB<0.4) + + req_sr_Zll_vpt_low = req_global & req_sr_Zll & (ll_cand.pt<150) + # print(ll_cand.pt<150) + # print(ak.any(ll_cand.pt<150, axis=-1) + # print(req_sr_Zll_vpt_low) + req_sr_Zll_vpt_high = req_global & req_sr_Zll & (ll_cand.pt>150) + # print(ll_cand.pt>150) + # print(req_sr_Zll_vpt_high) + # print(len(req_sr_Zll_vpt_low)) + # print(len(req_sr_Zll_vpt_low == req_sr_Zll_vpt_high)) + # print(np.sum(ak.to_numpy(req_sr_Zll_vpt_low))) + # print(np.sum(ak.to_numpy(req_sr_Zll_vpt_low == req_sr_Zll_vpt_high))) + + req_cr_Zcc_vpt_low = req_global & req_cr_Zcc & (ll_cand.pt<150) & (ll_cand.pt>50) + # print(req_sr_Zll_vpt_low) + req_cr_Zcc_vpt_high = req_global & req_cr_Zcc & (ll_cand.pt>150) + # print(req_sr_Zll_vpt_high) + # print(np.sum(ak.to_numpy(req_sr_Zll_vpt_low & req_sr_Zll_vpt_high))) + + req_cr_Z_LF_vpt_low = req_global & req_cr_Z_LF & (ll_cand.pt<150) + req_cr_Z_LF_vpt_high = req_global & req_cr_Z_LF & (ll_cand.pt>150) + + req_cr_Z_HF_vpt_low = req_global & req_cr_Z_HF & (ll_cand.pt<150) + req_cr_Z_HF_vpt_high = 
req_global & req_cr_Z_HF & (ll_cand.pt>150) + + req_cr_t_tbar_vpt_low = req_global & req_cr_t_tbar & (ll_cand.pt<150) + req_cr_t_tbar_vpt_high = req_global & req_cr_t_tbar & (ll_cand.pt>150) + + + #prob not necessary + #selection.add('SR',ak.to_numpy(req_sr_Zll)) + + selection.add('SR_2LL',ak.to_numpy(req_sr_Zll_vpt_low)) + selection.add('SR_2LH',ak.to_numpy(req_sr_Zll_vpt_high)) + selection.add('CR_Zcc_2LL',ak.to_numpy(req_cr_Zcc_vpt_low)) + selection.add('CR_Zcc_2LH',ak.to_numpy(req_cr_Zcc_vpt_high)) + selection.add('CR_Z_LF_2LL',ak.to_numpy(req_cr_Z_LF_vpt_low)) + selection.add('CR_Z_LF_2LH',ak.to_numpy(req_cr_Z_LF_vpt_high)) + selection.add('CR_Z_HF_2LL',ak.to_numpy(req_cr_Z_HF_vpt_low)) + selection.add('CR_Z_HF_2LH',ak.to_numpy(req_cr_Z_HF_vpt_high)) + selection.add('CR_t_tbar_2LL',ak.to_numpy(req_cr_t_tbar_vpt_low)) + selection.add('CR_t_tbar_2LH',ak.to_numpy(req_cr_t_tbar_vpt_high)) + + + + + + # ================================================================================= + # + # # Calculate and store weights & factors + # + # --------------------------------------------------------------------------------- + + # there is also nProcEvents, which might be related to nEvents by some factor + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/HelperClasses/SampleContainer.cc + # there are some more calculations related to weights, e.g. + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/HelperClasses/SampleContainer.cc#L115-L154 + + # ToDo: + # [ ] LHEScaleWeight ?? + # [ ] intWeight - is this only relevant when running over the post-processed samples, or already on top of Nano+AK15? + # [x] genWeight + # [ ] PrefireWeight - (for 2016+2017) see also: + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L2099-L2113 + # [ ] weight_PU + # [ ] weight_ptEWK + # [(x)] Lep_SF - but I'm not sure about EOY / UL compatibility + # [ ] recoWReWeight + # [ ] WJetNLOWeight + # [ ] cTagWeight - later, also including up/down syst + # [ ] weight_mettrigSF + # [ ] weight_puid - not the same as _PU + # [ ] weight_subptEWKnnlo - find out what "SubGen" is + # + # [ ] LOtoNLOWeightBjetSplitEtabb + # [ ] WPtCorrFactor + # [ ] ZPtCorrFactor + + + + + # running over more than just the Double[] datasets, but still requiring the same trigger + # not sure if correct + if 'DoubleEG' in dataset or 'Electron' in dataset: + output['cutflow'][dataset]['trigger'] += ak.sum(trigger_ee) + elif 'Muon' in dataset : + output['cutflow'][dataset]['trigger'] += ak.sum(trigger_mm) + + + # Successively add another cut w.r.t. 
previous line, looks a bit like N-1 histograms + output['cutflow'][dataset]['jet selection'] += ak.sum(njet>=2) + output['cutflow'][dataset]['global selection'] += ak.sum(req_global) + output['cutflow'][dataset]['signal region'] += ak.sum(req_global & req_sr_Zll) + output['cutflow'][dataset]['signal region & ee or mumu'] += ak.sum(req_global & req_sr_Zll & ( ((nele == 2) & trigger_ee) | ((nmu == 2) & trigger_mm))) + output['cutflow'][dataset]['signal ee'] += ak.sum(req_global & req_sr_Zll & (nele == 2) & trigger_ee) + output['cutflow'][dataset]['signal mumu'] += ak.sum(req_global & req_sr_Zll & (nmu == 2) & trigger_mm) + + + lepflav = ['ee','mumu'] + reg = ['SR_2LL','SR_2LH', + 'CR_Zcc_2LL','CR_Zcc_2LH', + 'CR_Z_LF_2LL','CR_Z_LF_2LH', + 'CR_Z_HF_2LL','CR_Z_HF_2LH', + 'CR_t_tbar_2LL','CR_t_tbar_2LH'] + + #print(possible_flavSplits) + list_weights = [] + lists_of_vars = {} + names = ['wei', 'Higgs_mass', 'Higgs_pt', 'Z_mass', 'Z_mass_gen', 'Z_pt_gen', 'Z_pt', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_phi_jj', 'del_eta_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + modes = ['low_ee', 'high_ee', 'low_mumu', 'high_mumu'] + for name in names: + for mode in modes: + lists_of_vars[f'{name}_{mode}'] = [] + ''' + lists_of_vars = {'wei': [], + 'Higgs_mass': [], + 'Higgs_pt': [], + 'Z_pt': [], + 'jjVptratio': [], + 'CvsL_max': [], + 'CvsL_min': [], + 'CvsB_max': [], + 'CvsB_min': [], + 'pt_lead': [], + 'pt_sublead': [], + 'del_phi_jjV': [], + 'del_R_jj': [], + 'del_eta_jj': [], + 'del_phi_ll': [], + 'del_eta_ll': [], + 'del_phi_l2_subleading': [], + 'del_phi_l2_leading': [] + } + ''' + #### write into histograms (i.e. write output) + for histname, h in output.items(): + for s in possible_flavSplits: + dataset_renamed = dataset if s == 'already_split_sample' else dataset + s + for ch in lepflav: + for r in reg: + cut = selection.all('lepsel', + 'jetsel', + 'global_selection', + 'metfilter', + 'lumi', + r, + ch, + s, + 'trigger_%s'%(ch)) + llcut = ll_cand[cut] + # this next line is necessary if running with multiple possible ll candidates + #llcut = llcut[:,0] + + lep1cut = llcut.lep1 + lep2cut = llcut.lep2 + #print(self._version) + if not isRealData and not self._debug: + #print('not data, not test') + if ch == 'ee': + lepsf = eleSFs(lep1cut, self._year, self._corr) * eleSFs(lep2cut, self._year, self._corr) + elif ch == 'mumu': + lepsf = muSFs(lep1cut, self._year, self._corr) * muSFs(lep2cut, self._year, self._corr) + ''' + # This would be emu channel, which does not exist in the VHcc Zll case + else: + lepsf = np.where(lep1cut.lep_flav == 11, + eleSFs(lep1cut, self._year, self._corr) * muSFs(lep2cut, self._year, self._corr), + 1.) \ + * np.where(lep1cut.lep_flav == 13, + eleSFs(lep2cut, self._year, self._corr) * muSFs(lep1cut, self._year, self._corr), + 1.) + ''' + else : + #lepsf = weights.weight()[cut] + # AS: if I understand correctly, this only works because in case of data, weights are identically 1 for every entry + # otherwise this would double count the weights in a later step (where lepsf gets multiplied by the weights!) 
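+                        # ak.full_like(x, 1) builds an array of ones with the same
+                        # length and layout as x, so for data the lepton SF is exactly 1
+                        # and multiplying by it later leaves the event weights unchanged.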
+ lepsf = ak.full_like(weights.weight()[cut], 1) + #print(lepsf) + # print(weights.weight()[cut]*lepsf) + # print(lepsf) + ''' + if self._export_array and not isRealData: + if ch == 'ee' and r == 'SR_2LL' and s == '_cc': + eell_cand = ak.zip({ + "Higgs_mass" : higgs_cand['mass'][cut] * lepsf, + #"jet2" : subleading_with_fsr, + #"pt": (leading_with_fsr + subleading_with_fsr).pt, + #"eta": (leading_with_fsr + subleading_with_fsr).eta, + #"phi": (leading_with_fsr + subleading_with_fsr).phi, + #"mass": (leading_with_fsr + subleading_with_fsr).mass, + }) + print(eell_cand) + ''' + if 'leading_jetflav_' in histname and 'sub' not in histname: + #print(dir(leading)) + #print(h.axes) + names = [ax.name for ax in h.axes] + fields = {l: normalize(leading[histname.replace('leading_jetflav_','')], + cut) for l in names if l in dir(leading)} + #print(fields) + #sys.exit() + if isRealData: + flavor = ak.zeros_like(normalize(leading['pt'],cut)) + else: + flavor = normalize(leading.hadronFlavour,cut) + h.fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + flav = flavor, + **fields, + weight = weights.weight()[cut] * lepsf) + elif 'subleading_jetflav_' in histname: + #print(dir(subleading)) + names = [ax.name for ax in h.axes] + fields = {l: normalize(subleading[histname.replace('subleading_jetflav_','')], + cut) for l in names if l in dir(subleading)} + if isRealData: + flavor = ak.zeros_like(normalize(subleading['pt'],cut)) + else: + flavor = normalize(subleading.hadronFlavour,cut) + h.fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + flav = flavor, + **fields, + weight = weights.weight()[cut] * lepsf) + elif 'lep1_' in histname: + names = [ax.name for ax in h.axes] + fields = {l: ak.fill_none(flatten(lep1cut[histname.replace('lep1_','')]), + np.nan) for l in names if l in dir(lep1cut)} + h.fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + **fields, + weight = weights.weight()[cut] * lepsf) + elif 'lep2_' in histname: + names = [ax.name for ax in h.axes] + fields = {l: ak.fill_none(flatten(lep2cut[histname.replace('lep2_','')]), + np.nan) for l in names if l in dir(lep2cut)} + h.fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + **fields, + weight = weights.weight()[cut] * lepsf) + #elif 'MET_' in histname: + # fields = {l: normalize(events.MET[histname.replace('MET_','')], + # cut) for l in names if l in dir(events.MET)} + # h.fill( + # datasetSplit = dataset_renamed, + # lepflav = ch, + # region = r, + # **fields, + # weight = weights.weight()[cut] * lepsf) + elif 'll_' in histname: + names = [ax.name for ax in h.axes] + fields = {l: ak.fill_none(flatten(llcut[histname.replace('ll_','')]), + np.nan) for l in names if l in dir(llcut)} + #print(max(llcut['pt'])) + h.fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + **fields, + weight = weights.weight()[cut] * lepsf) + elif 'jj_' in histname: + names = [ax.name for ax in h.axes] + fields = {l: normalize(higgs_cand[histname.replace('jj_','')], + cut) for l in names if l in dir(higgs_cand)} + h.fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + **fields, + weight = weights.weight()[cut] * lepsf) + else: + output['nj'].fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + nj = normalize(ak.num(jet_conditions),cut), + weight = weights.weight()[cut]*lepsf) + # check? 
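+                        # The ak.where constructions in the two fills below compute, per event,
+                        #   nAdd    = max(n_selected_jets - 2, 0)
+                        #   nAddFSR = max(nAdd - n_fsr_only_jets, 0)
+                        # where n_fsr_only_jets counts jets passing the FSR criteria
+                        # but failing the regular jet selection.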
+ output['nAddJets302p5_puid'].fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + nAddJets302p5_puid = normalize(ak.where((ak.num(jet_conditions) > 2), + (ak.num(jet_conditions)-2), + (ak.zeros_like(ak.num(jet_conditions))) + ), + cut), + weight = weights.weight()[cut]*lepsf) + # check? + output['nAddJetsFSRsub302p5_puid'].fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + nAddJetsFSRsub302p5_puid = normalize(ak.where((ak.where((ak.num(jet_conditions) > 2), + (ak.num(jet_conditions)-2), + (ak.zeros_like(ak.num(jet_conditions))) + ) + -ak.num((~jet_conditions) & (fsr_conditions))) > 0, + (ak.where((ak.num(jet_conditions) > 2), + (ak.num(jet_conditions)-2), + (ak.zeros_like(ak.num(jet_conditions))) + ) + -ak.num((~jet_conditions) & (fsr_conditions))), + (ak.zeros_like(ak.num(jet_conditions)))), + cut), + weight = weights.weight()[cut]*lepsf) + #if not isRealData: + # output['weight_full'].fill( + # datasetSplit = dataset_renamed, + # lepflav = ch, + # region = r, + # weight_full = weights.weight()[cut]*lepsf) + # output['genweight'].fill( + # datasetSplit = dataset_renamed, + # lepflav = ch, + # region = r, + # genWeight = events.genWeight[cut]) + # output['sign_genweight'].fill( + # datasetSplit = dataset_renamed, + # lepflav = ch, + # region = r, + # genWeight_by_abs = (events.genWeight/abs(events.genWeight))[cut]) + output['jjVPtRatio'].fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + jjVPtRatio = (normalize(higgs_cand['pt'], + cut) / ak.fill_none(flatten(llcut['pt']), + np.nan)), + weight = weights.weight()[cut] * lepsf) + if self._export_array and not isRealData: + import pandas as pd + #output['array'][dataset]['weight'] += processor.column_accumulator( + # ak.to_numpy(weights.weight()[cut] * lepsf) + # ) + + list_weights.append(ak.to_numpy(weights.weight()[cut] * lepsf)) + + roi = ['SR_2LL','SR_2LH'] + lepflav_chosen = ['ee','mumu'] + names_dict = {'wei': weights.weight()[cut]* lepsf , #weights.weight()[cut] * lepsf + 'Higgs_mass': higgs_cand['mass'][cut], + 'Higgs_pt': higgs_cand['pt'][cut], + 'Z_mass': ll_cand['mass'][cut], + 'Z_mass_gen': ll_cand_gen['mass'][cut], + 'Z_pt_gen': ll_cand_gen['pt'][cut], + 'Z_pt': ll_cand['pt'][cut], + 'jjVptratio': (higgs_cand['pt'][cut])/ (ll_cand['pt'][cut]), + 'CvsL_max': leading_with_fsr['jet1']['btagDeepFlavCvL'][cut], + 'CvsL_min': subleading_with_fsr['jet1']['btagDeepFlavCvL'][cut], + 'CvsB_max': leading_with_fsr['jet1']['btagDeepFlavCvB'][cut], + 'CvsB_min': subleading_with_fsr['jet1']['btagDeepFlavCvB'][cut], + 'pt_lead': leading_with_fsr['jet1']['pt'][cut], + 'pt_sublead': subleading_with_fsr['jet1']['pt'][cut], + 'del_phi_jjV': np.abs((higgs_cand[cut]).delta_phi(ll_cand[cut])), + 'del_R_jj': np.abs((higgs_cand['jet1'][cut]).delta_r(higgs_cand['jet2'][cut])), + 'del_eta_jj': np.abs((higgs_cand['jet1']['eta'][cut]) - (higgs_cand['jet2']['eta'][cut])), + 'del_phi_jj': np.abs((higgs_cand['jet1'][cut]).delta_phi(higgs_cand['jet2'][cut])), + 'del_phi_ll': np.abs((ll_cand['lep1'][cut]).delta_phi(ll_cand['lep2'][cut])), + 'del_eta_ll': np.abs((ll_cand['lep1']['eta'][cut]) - (ll_cand['lep2']['eta'][cut])), + 'del_phi_l2_subleading': np.abs((ll_cand['lep2'][cut]).delta_phi(higgs_cand['jet2'][cut])), + 'del_phi_l2_leading': np.abs((ll_cand['lep2'][cut]).delta_phi(higgs_cand['jet1'][cut])) + } + if ch in lepflav_chosen and r in roi: + if ch == 'ee': + if r == 'SR_2LL': + for var_name, var_value in names_dict.items(): + lists_of_vars[f'{var_name}_low_ee'].append(ak.to_numpy(var_value)) + 
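+                                        # ('low'/'high' in the array names refers to V pT below/above
+                                        # 150 GeV, i.e. the SR_2LL / SR_2LH split defined earlier)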
#output['array'][dataset][f'{var_name}_low_ee'] += processor.column_accumulator(
+                                #    ak.to_numpy(var_value)
+                                #    )
+                        elif r == 'SR_2LH':
+                            for var_name, var_value in names_dict.items():
+                                lists_of_vars[f'{var_name}_high_ee'].append(ak.to_numpy(var_value))
+                                #output['array'][dataset][f'{var_name}_high_ee'] += processor.column_accumulator(
+                                #    ak.to_numpy(var_value)
+                                #    )
+                    elif ch == 'mumu':
+                        if r == 'SR_2LL':
+                            for var_name, var_value in names_dict.items():
+                                lists_of_vars[f'{var_name}_low_mumu'].append(ak.to_numpy(var_value))
+                                #output['array'][dataset][f'{var_name}_low_mumu'] += processor.column_accumulator(
+                                #    ak.to_numpy(var_value)
+                                #    )
+                        elif r == 'SR_2LH':
+                            for var_name, var_value in names_dict.items():
+                                lists_of_vars[f'{var_name}_high_mumu'].append(ak.to_numpy(var_value))
+                                #output['array'][dataset][f'{var_name}_high_mumu'] += processor.column_accumulator(
+                                #    ak.to_numpy(var_value)
+                                #    )
+
+
+
+        # TODO: jet-energy regression and kinematic fit are not applied yet
+        # flatten the per-chunk lists into plain 1D numpy arrays
+        list_weights = np.array([item for sublist in list_weights for item in sublist])
+        #print(list_weights)
+        #print(lists_of_vars)
+        for v_name in lists_of_vars.keys():
+            lists_of_vars[v_name] = np.array([item for sublist in lists_of_vars[v_name] for item in sublist])
+        #print(lists_of_vars)
+
+        # disabled CSV-based export variant, kept for reference
+        '''
+        try:
+            df_weights = pd.read_csv(f'{folder_save}/{dataset}/{filename}/test_save_weights_full.csv')
+        except FileNotFoundError:
+            df_weights = pd.DataFrame([], columns = ['weights'])
+        '''
+        # read-append-write: extend the on-disk weight array with this chunk's weights
+        try:
+            weights_array = np.load(f'{folder_save}/{dataset}/{filename}/test_weights_full.npy')
+        except FileNotFoundError:
+            weights_array = np.array([])
+
+        df_wei = pd.DataFrame([], columns = ['weights'])
+        df_wei['weights'] = list_weights
+        weight = np.array(list_weights)
+
+
+        #df_weights_full = pd.concat([df_weights, df_wei], ignore_index = True)
+        #df_wei.to_csv(f'{folder_save}/{dataset}/{filename}/test_save_weights.csv', sep=',', encoding='utf-8', index=False)
+        #df_weights_full.to_csv(f'{folder_save}/{dataset}/{filename}/test_save_weights_full.csv', sep=',', encoding='utf-8', index=False)
+        weights_full = np.concatenate((weights_array, weight), axis = None)
+        np.save(f'{folder_save}/{dataset}/{filename}/test_weights_full.npy', weights_full, allow_pickle = False)
+
+        # disabled CSV-based export variant, kept for reference
+        '''
+        try:
+            df_else_everything = pd.read_csv(f'{folder_save}/{dataset}/{filename}/test_else_save_no_weights_full.csv')
+        except FileNotFoundError:
+            df_else_everything = pd.DataFrame([], columns = [v_name for v_name in lists_of_vars.keys()])
+        df_else = pd.DataFrame([], columns = [v_name for v_name in lists_of_vars.keys()])
+        #print(df_else)
+        for var in lists_of_vars.keys():
+            df_else[var] = pd.Series(lists_of_vars[var])
+        '''
+
+        # same read-append-write pattern for every exported training variable
+        for var in lists_of_vars.keys():
+            try:
+                else_var_array = np.load(f'{folder_save}/{dataset}/{filename}/test_{var}_full.npy')
+            except FileNotFoundError:
+                else_var_array = np.array([])
+            else_v_curr_array = np.array(lists_of_vars[var])
+            else_var_full_array = np.concatenate((else_var_array, else_v_curr_array), axis = None)
+            np.save(f'{folder_save}/{dataset}/{filename}/test_{var}_full.npy', else_var_full_array, allow_pickle = False)
+
+        #df_else_full = pd.concat([df_else_everything, df_else], ignore_index = True)
+
+        #df_else.to_csv(f'{folder_save}/{dataset}/{filename}/test_else_save_no_weights.csv', sep=',', encoding='utf-8', index=False)
+        #df_else_full.to_csv(f'{folder_save}/{dataset}/{filename}/test_else_save_no_weights_full.csv', sep=',', encoding='utf-8', index=False)
+
+        return {dataset: output}
+
+    def postprocess(self, accumulator):
+        #print(accumulator)
+        
return accumulator diff --git a/cfg_VHcc_mod.py b/cfg_VHcc_mod.py new file mode 100644 index 0000000..b9d8e20 --- /dev/null +++ b/cfg_VHcc_mod.py @@ -0,0 +1,163 @@ +# Local Variables: +# python-indent-offset: 4 +# End: + +from VHcc.workflows.Zll_process_newHist_pandas_small_update_isolation import ( + NanoProcessor as VH_Zll, +) + +cfg = { + "userconfig": {'version':'test_nolepsf'}, + "dataset": { + "jsons": [ + "src/VHcc/metadata/bg_rwth_test_new_10.json"], + "campaign": "UL17", + "year": "2017", + "filter": { + "samples": [ + "DYJetsToLL_nlo_vau_bg" + #"ZHToCC_vau_sig" + #"DYJetsToLL_nlo", + #"DY1ToLL_PtZ-250To400", + #"DY1ToLL_PtZ-50To150", + #"DY1ToLL_PtZ-150To250", + #"DY1ToLL_PtZ-400ToInf", + #"DY2ToLL_PtZ-50To150", + #"DY2ToLL_PtZ-150To250", + #"DY2ToLL_PtZ-250To400", + #"DY2ToLL_PtZ-400ToInf", + #"", + #"", + #"", + ], + "samples_exclude": [], + }, + }, + # Input and output files + "workflow": VH_Zll, + "output": "output_vhcc_zll", + "run_options": { + #"executor": "parsl/condor/naf_lite", + "executor": "parsl/condor", + #"executor": "futures", + "workers": 1, + "scaleout": 150, + "walltime": "01:00:00", + "mem_per_worker": 2, # GB + "chunk": 50000, + "max": None, + "skipbadfiles": True, + "voms": None, + "limit": 80, + "retries": 20, + "splitjobs": False, + "requirements": ( + '( Machine != "lx1b02.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3a03.physik.rwth-aachen.de") && ' + '( Machine != "lx3a05.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3a06.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3a09.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3a13.physik.rwth-aachen.de") && ' + '( Machine != "lx3a14.physik.rwth-aachen.de") && ' + '( Machine != "lx3a15.physik.rwth-aachen.de") && ' + '( Machine != "lx3a23.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3a25.physik.rwth-aachen.de") && ' + '( Machine != "lx3a27.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3a46.physik.rwth-aachen.de") && ' + '( Machine != "lx3a44.physik.rwth-aachen.de") && ' + '( Machine != "lx3a47.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3a55.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3a56.physik.rwth-aachen.de") && ' + '( Machine != "lx3b08.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3b09.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3b13.physik.rwth-aachen.de") && ' + '( Machine != "lx3b18.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3b24.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3b29.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3b32.physik.rwth-aachen.de") && ' + '( Machine != "lx3b33.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3b34.physik.rwth-aachen.de") && ' + '( Machine != "lx3b41.physik.rwth-aachen.de") && ' + '( Machine != "lx3b46.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3b47.physik.rwth-aachen.de") && ' + '( Machine != "lx3b48.physik.rwth-aachen.de") && ' + '( Machine != "lx3b49.physik.rwth-aachen.de") && ' + '( Machine != "lx3b52.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3b55.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3b57.physik.rwth-aachen.de") && ' + '( Machine != "lx3b62.physik.rwth-aachen.de") && ' + '( Machine != "lx3b66.physik.rwth-aachen.de") && ' + '( Machine != "lx3b68.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3b69.physik.rwth-aachen.de") && ' + '( Machine != "lx3b70.physik.rwth-aachen.de") && ' + '( Machine != "lx3b71.physik.rwth-aachen.de") && ' + '( Machine != "lx3b99.physik.rwth-aachen.de") && ' + '( Machine != "lxblade01.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade02.physik.RWTH-Aachen.de") && ' + '( Machine != 
"lxblade03.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade04.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade05.physik.rwth-aachen.de") && ' + '( Machine != "lxblade06.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade07.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade08.physik.rwth-aachen.de") && ' + '( Machine != "lxblade09.physik.rwth-aachen.de") && ' + '( Machine != "lxblade10.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade11.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade12.physik.rwth-aachen.de") && ' + '( Machine != "lxblade13.physik.rwth-aachen.de") && ' + '( Machine != "lxblade14.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade15.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade16.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade17.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade18.physik.rwth-aachen.de") && ' + '( Machine != "lxblade19.physik.rwth-aachen.de") && ' + '( Machine != "lxblade20.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade21.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade22.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade23.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade24.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade25.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade26.physik.rwth-aachen.de") && ' + '( Machine != "lxblade27.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade28.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade29.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade30.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade31.physik.rwth-aachen.de") && ' + '( Machine != "lxblade32.physik.rwth-aachen.de") && ' + '( Machine != "lxcip01.physik.rwth-aachen.de") && ' + '( Machine != "lxcip02.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip05.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip06.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip09.physik.rwth-aachen.de") && ' + '( Machine != "lxcip10.physik.rwth-aachen.de") && ' + '( Machine != "lxcip11.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip12.physik.rwth-aachen.de") && ' + '( Machine != "lxcip14.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip15.physik.rwth-aachen.de") && ' + '( Machine != "lxcip16.physik.rwth-aachen.de") && ' + '( Machine != "lxcip17.physik.rwth-aachen.de") && ' + '( Machine != "lxcip18.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip19.physik.rwth-aachen.de") && ' + '( Machine != "lxcip24.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip25.physik.rwth-aachen.de") && ' + '( Machine != "lxcip26.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip27.physik.rwth-aachen.de") && ' + '( Machine != "lxcip28.physik.rwth-aachen.de") && ' + '( Machine != "lxcip29.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip30.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip31.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip32.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip34.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip35.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip50.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip51.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip52.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip53.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip54.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip55.physik.rwth-aachen.de") && ' + '( Machine != "lxcip56.physik.rwth-aachen.de") && ' + '( Machine != "lxcip57.physik.rwth-aachen.de") && ' + '( Machine != "lxcip58.physik.rwth-aachen.de") && ' + '( Machine != "lxcip59.physik.rwth-aachen.de")'), + 
}, +} diff --git a/xgb_test.py b/xgb_test.py new file mode 100644 index 0000000..80ccb58 --- /dev/null +++ b/xgb_test.py @@ -0,0 +1,456 @@ +from coffea.util import load +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt, mplhep as hep +import hist +import argparse, sys, os, arrow, glob, yaml +from matplotlib.offsetbox import AnchoredText + +from BTVNanoCommissioning.utils.plot_utils import ( + plotratio, + +) +net_path = "/net/scratch_cms3a/vaulin/" +folder_save = 'eval_23_03_14' +if not os.path.exists(f"./plot/{folder_save}"): + os.mkdir(f"./plot/{folder_save}") +if not os.path.exists(net_path + f"plot/{folder_save}"): + os.mkdir(net_path + f"plot/{folder_save}") +def autoranger(array): + val, axis = array, np.arange(0,len(array)+1) + for i in range(len(val)): + if val[i] != 0: + mins = i + break + for i in reversed(range(len(val))): + if val[i] != 0: + maxs = i + 1 + break + print(axis[mins], axis[maxs]) + return axis[mins], axis[maxs], np.max(val), np.min(val) +names_sig = ['wei','Higgs_mass', 'Higgs_pt', 'Z_pt', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +######################################################################### +######### Reading the bg as output and signal as signal ################# +######################################################################### +output_names = ['output_vhcc_zll_v81_bg_5_files_1_chunk', 'output_vhcc_zll_v81_bg_5_files_2_chunk', 'output_vhcc_zll_v81_bg_5_files_3_chunk', 'output_vhcc_zll_v81_bg_5_files_4_chunk', 'output_vhcc_zll_v81_bg_5_files_5_chunk', 'output_vhcc_zll_v81_bg_5_files_6_chunk', 'output_vhcc_zll_v81_bg_5_files_7_chunk', 'output_vhcc_zll_v81_bg_5_files_8_chunk', 'output_vhcc_zll_v81_bg_5_files_9_chunk', 'output_vhcc_zll_v81_bg_6_files_10_chunk'] +signal_names = ['output_vhcc_zll_v54_signal_35_files'] + +outputs = [load(f"{name}/output.coffea") for name in output_names] +signals = [load(f"{name}/output.coffea") for name in signal_names] + +outputs = [out['DYJetsToLL_nlo_vau_bg'] for out in outputs] +signals = [sig['ZHToCC_vau_sig'] for sig in signals] + + +output=load('output_vhcc_zll_v47_bg_2_files/output.coffea') +signal=load('output_vhcc_zll_v54_signal_5_files/output.coffea') + +output = output['DYJetsToLL_nlo_vau_bg'] +signal = signal['ZHToCC_vau_sig'] +#print(output['array']) +######################################################################### +########## Testing bg to see the structure ############################## +######################################################################### +for f in output['array'].keys(): + print(f) + try: + for k in f.keys(): + print(k) + except AttributeError: + print("No keys found") +######################################################################### + + +######################################################################### +######### Testing sig to see the structure ############################## +######################################################################### +for f in signal['array'].keys(): + print(f) + try: + for k in f.keys(): + print(k) + except AttributeError: + print("No keys found") +######################################################################### + +######################################################################### +###### Reading the arrays into collect_var dictionary for sig ########### 
+######################################################################### +names_sig = ['wei','Higgs_mass', 'Higgs_pt', 'Z_pt', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +def output_collect_sig(sig): + sumw_sig = {} + collect_var_sig={} + varlist_sig = ['weight'] + + names_sig = ['wei','Higgs_mass', 'Higgs_pt', 'Z_pt', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + for name in names_sig: + varlist_sig.append(f'{name}_low_ee') + varlist_sig.append(f'{name}_high_ee') + varlist_sig.append(f'{name}_low_mumu') + varlist_sig.append(f'{name}_high_mumu') + + + for s in sig['array'].keys(): + #iterated samples inside coffea file + if s not in sumw_sig.keys():sumw_sig[s]=sig['array'][s]['sumw'] + else:sumw_sig[s] += sig['array'][s]['sumw'] + + if s not in collect_var_sig.keys():collect_var_sig[s]={} + # iterate regions(SR, CR, for H+c) + for var in varlist_sig: + # get arrays for each variable + if var=='BDT' : continue + if var not in list(collect_var_sig[s].keys()):collect_var_sig[s][var]=sig['array'][s][var].value + else:collect_var_sig[s][var]=np.concatenate((collect_var_sig[s][var],sig['array'][s][var].value)) + + #print(sumw) + #print(collect_var) + for var in collect_var_sig.keys(): + #print(var) + for key in collect_var_sig[var].keys(): + #print(key) + #print(collect_var_sig[var][key]) + #print(len(collect_var_sig[var][key])) + pass + return varlist_sig, collect_var_sig +big_signal_varlist = [] +big_signal_variable_collection = [] +for coffea in signals: + varlist_sig, collect_var_sig = output_collect_sig(coffea) + big_signal_varlist.append(varlist_sig) + big_signal_variable_collection.append(collect_var_sig) + +varlist_sig, collect_var_sig = output_collect_sig(signal) + +#print(varlist_sig) +#print(big_signal_varlist) +#print(collect_var_sig) +#print(big_signal_variable_collection) +######################################################################### + + +######################################################################### +###### Reading the arrays into collect_var dictionary for bg ############ +######################################################################### +names_bg = ['wei','Higgs_mass', 'Higgs_pt', 'Z_pt', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] +def output_collect_bg(bg_file): + sumw_bg = {} + collect_var_bg={} + varlist_bg = ['weight'] + + names_bg = ['wei','Higgs_mass', 'Higgs_pt', 'Z_pt', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + for name in names_bg: + varlist_bg.append(f'{name}_low_ee') + varlist_bg.append(f'{name}_high_ee') + varlist_bg.append(f'{name}_low_mumu') + varlist_bg.append(f'{name}_high_mumu') + + + for s in bg_file['array'].keys(): + #iterated samples inside coffea file + if s not in sumw_bg.keys():sumw_bg[s]=bg_file['array'][s]['sumw'] + else:sumw_bg[s] += bg_file['array'][s]['sumw'] + + if s not in collect_var_bg.keys():collect_var_bg[s]={} + # iterate regions(SR, CR, for H+c) + for var 
in varlist_bg: + # get arrays for each variable + if var=='BDT' : continue + if var not in list(collect_var_bg[s].keys()):collect_var_bg[s][var]=bg_file['array'][s][var].value + else:collect_var_bg[s][var]=np.concatenate((collect_var_bg[s][var],bg_file['array'][s][var].value)) + + #print(sumw) + #print(collect_var) + for var in collect_var_bg.keys(): + #print(collect_var[var]) + for key in collect_var_bg[var].keys(): + print(key) + #print(collect_var[var][key]) + #print(len(collect_var[var][key])) + return varlist_bg, collect_var_bg +varlist_bg, collect_var_bg = output_collect_bg(output) +big_bg_varlist = [] +big_bg_variable_collection = [] +for coffea in outputs: + varlist_bg, collect_var_bg = output_collect_bg(coffea) + big_bg_varlist.append(varlist_bg) + big_bg_variable_collection.append(collect_var_bg) + +#print(varlist_bg) +#print(big_bg_varlist) +#print(collect_var_bg) +#print(big_bg_variable_collection) +######################################################################### + + +######################################################################### +## Mergemap - dictionary with files, associated with their categories ### +######################################################################### +mergemap={'signal': ['ZHToCC_vau_sig'], 'bg': ['DYJetsToLL_nlo_vau_bg']} +trainvar = [] +#for s in varlist_sig: +# trainvar.append(f'{s}_signal') +#for b in varlist_bg: +# trainvar.append(f'{b}_background') +trainvar = varlist_sig +#print(trainvar) +MCvar={} +weivar={} + +for var in trainvar : + MCbkgLM = [] + MCvar[var]={} + for m in mergemap: + tmpml = [] + tmpwei = [] + if m == 'signal': + for colvarsig in big_signal_variable_collection: + tmpml=np.concatenate((tmpml,colvarsig[mergemap[m][0]][var])) + + tmpwei=np.concatenate((tmpwei,colvarsig[mergemap[m][0]]['weight'])) + elif m == 'bg': + for colvarbag in big_bg_variable_collection: + tmpml=np.concatenate((tmpml,colvarbag[mergemap[m][0]][var])) + + tmpwei=np.concatenate((tmpwei,colvarbag[mergemap[m][0]]['weight'])) + MCvar[var][m]=tmpml + weivar[m]=tmpwei + MCbkgLM+=[tmpml] + +print(MCvar.keys()) +print(MCvar['Higgs_mass_low_ee'].keys()) +len_var = [] +len_var_bg = [] + +df_sig = pd.DataFrame([], columns = [f'{col}_low_ee' for col in names_sig]) +print(df_sig) +for var in MCvar.keys(): + if '_low_ee' in var: + len_var.append(len(MCvar[var]['signal'])) + df_sig[var] = MCvar[var]['signal'] + df_sig['target'] = np.ones(np.max(len_var)) +print(df_sig) +print(np.max(len_var), np.min(len_var)) + + +df_bg = pd.DataFrame([], columns = [f'{col}_low_ee' for col in names_sig]) +print(df_bg) +for var in MCvar.keys(): + if '_low_ee' in var: + len_var_bg.append(len(MCvar[var]['bg'])) + df_bg[var] = MCvar[var]['bg'] + df_bg['target'] = np.zeros(np.max(len_var_bg)) +print(df_bg) +print(np.max(len_var_bg), np.min(len_var_bg)) + +df = pd.concat([df_sig, df_bg], ignore_index = True) +print(df) +print(df.info()) +df.to_csv('xgb_training_dataset_low_ee.csv', sep=',', encoding='utf-8', index=False) + +time = arrow.now().format("YY_MM_DD") +plt.style.use(hep.style.ROOT) +names_sig_updated = ['m(H)', '$p_t$(H)', '$p_t$(Z)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$', + '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet', + '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(e_1, e_2)$', '$\Delta\eta(e_1, e_2)$', + '$\Delta\Phi (e_{subleading}, jet_{subleading})$', '$\Delta\Phi (e_{subleading}, jet_{leading})$'] + +c = 0 +for col in names_sig[1:]: + + 
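# one pass per training variable: a quick unweighted shape comparison first,
+    # then a weighted, normalised signal/background overlay with a ratio panel
+    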
plt.figure(figsize=(10,10)) + len_sig = 0 + for i in range(0,len(df['target'])): + if df['target'][i] == 1: + len_sig += 1 + print(len_sig) + names_big_ax = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'pt_lead', 'pt_sublead'] + if col in names_big_ax: + hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_low_ee'][:len_sig])).plot() + hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_low_ee'][len_sig:])).plot() + else: + hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_low_ee'][:len_sig])).plot() + hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_low_ee'][len_sig:])).plot() + if 'pt' in col: + if 'ratio' not in col: + plt.xlabel('$p_t$ in Gev') + else: + plt.xlabel('') + elif 'mass' in col: + plt.xlabel('Mass in Gev') + else: + plt.xlabel('') + plt.ylabel("Counts") + plt.title(f'{names_sig_updated[c]}_low_ee') + plt.legend(['Signal', 'Background']) + #plt.show() + plt.savefig(f"plot/{folder_save}/{col}_low_ee.jpg") + + + + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_low_ee'][:len_sig]),bins = 80, weights = np.array(df['wei_low_ee'][:len_sig])), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + density = True, + ) + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_low_ee'][len_sig:]),bins =80, weights = np.array(df['wei_low_ee'][len_sig:])), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + density = True, + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_low_ee'][:len_sig]),bins = 80, weights = np.array(df['wei_low_ee'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_low_ee'][len_sig:]),bins =80, weights = np.array(df['wei_low_ee'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} low ee') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_low_ee'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"plot/{folder_save}/compare_{col}_low_ee.pdf") + fig.savefig(f"plot/{folder_save}/compare_{col}_low_ee.jpg") + c += 1 + + +X = df.drop("target", axis = 1) +print(X) +X = X.drop("wei_low_ee", axis = 1) +print(X) +print(X.info()) + +y = df["target"] +print(y) + + + + +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder + +categorical_pipeline = Pipeline(steps = [("impute", 
SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + +from sklearn.preprocessing import StandardScaler +numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + +cat_cols = X.select_dtypes(exclude = "number").columns +num_cols = X.select_dtypes(include = "number").columns + +print(cat_cols) +print(num_cols) + +from sklearn.compose import ColumnTransformer + +full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + +import xgboost as xgb + +X_processed = full_processor.fit_transform(X) +y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state = 1121218) + +from sklearn.metrics import accuracy_score + +### Init classifier +xgb_cl = xgb.XGBClassifier(booster = 'gbtree', base_score = 0.5, learning_rate = 0.01, gamma = 1, reg_alpha = 0.2, reg_lambda = 0.2, n_estimators = 1000, max_depth = 3, subsample = 0.8) + +### Fit +xgb_cl.fit(X_train, y_train) + +print(xgb_cl) +### Predict +preds = xgb_cl.predict(X_test) + +print(accuracy_score(y_test, preds)) + +from xgboost import plot_importance +from xgboost import plot_tree + +importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_}) +importances = importances.sort_values(by = "Importance", ascending = False) +importances = importances.set_index('Feature') +print(importances) +importances.plot.bar() + +fig, ax = plt.subplots(figsize=(17,12)) +plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax) +plt.xlabel('Feature scores') +plt.ylabel("Feature names") +plt.title('Importance plot') +plt.legend(['']) +#plt.show() +plt.savefig(f"plot/{folder_save}/importance.jpg") + +plt.figure(figsize=(17,12)) +plot_tree(xgb_cl, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"plot/{folder_save}/boost_tree.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show() diff --git a/xgb_test_no_coffea.py b/xgb_test_no_coffea.py new file mode 100644 index 0000000..16547aa --- /dev/null +++ b/xgb_test_no_coffea.py @@ -0,0 +1,605 @@ +from coffea.util import load +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt, mplhep as hep +import hist +import argparse, sys, os, arrow, glob, yaml +from matplotlib.offsetbox import AnchoredText +from pathlib import Path +import os +from BTVNanoCommissioning.utils.plot_utils import ( + plotratio, + +) +net_path = "/net/scratch_cms3a/vaulin/" +folder_save = 'eval_23_04_19_later' +if not os.path.exists(f"./plot/{folder_save}"): + os.mkdir(f"./plot/{folder_save}") +if not os.path.exists(net_path + f"plot/{folder_save}"): + os.mkdir(net_path + f"plot/{folder_save}") +def autoranger(array): + val, axis = array, np.arange(0,len(array)+1) + for i in range(len(val)): + if val[i] != 0: + mins = i + break + for i in reversed(range(len(val))): + if val[i] != 0: + maxs = i + 1 + break + print(axis[mins], axis[maxs]) + return axis[mins], axis[maxs], np.max(val), np.min(val) +names_sig = ['wei', 'Higgs_mass', 'Higgs_pt', 'Z_pt', 'Z_mass', 'Z_pt_gen', 'Z_mass_gen', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 
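# the remaining entries are angular separations (delta R / delta eta / delta phi)
+             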
'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +roiis = ['high_mumu', 'high_ee', 'low_mumu', 'low_ee'] +roi = 'low_mumu' +###################################################################################### +##### Read np arrays of signal sample ################################################ +###################################################################################### +paths_np = [str(x) for x in Path("./condor_signal_04_mid/ZHToCC_vau_sig").glob("**/*.npy") if ("_full" in str(x))] +#print(paths_np) +print(len(paths_np)) +df_sig_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_sig_full_np) + +key_np = {} +for col in names_sig: + for rois in roiis: + key_np[f'{col}_{rois}'] = [] +for col in names_sig: + for rois in roiis: + for path in paths_np: + if f'{col}_{rois}' in path: + key_np[f'{col}_{rois}'].append(path) +#print(key_np) +for key in key_np.keys(): + key_np[key] = [np.load(element) for element in key_np[key]] +#print(key_np) + +key_np_full = {} +for col in names_sig: + for rois in roiis: + key_np_full[f'{col}_{rois}'] = np.array([]) +for key in key_np_full.keys(): + key_np_full[key] = np.concatenate(tuple(key_np[key]), axis = None) +#print(key_np_full) + +for key in key_np_full.keys(): + df_sig_full_np[key] = pd.Series(key_np_full[key]) +print(df_sig_full_np) +df_s_new_np = df_sig_full_np[[f'{col}_{roi}' for col in names_sig]] +df_s_new_np = df_s_new_np.dropna() +print(df_s_new_np) +len_var = [] +for col in names_sig: + len_var.append(len(df_s_new_np[f'{col}_{roi}'])) + df_s_new_np['target'] = np.ones(np.max(len_var)) +print(df_s_new_np) +###################################################################################### + + +###################################################################################### +##### Read np arrays of background sample ############################################ +###################################################################################### +paths_np_back = [str(x) for x in Path("./condor_back_04_mid/DYJetsToLL_nlo_vau_bg").glob("**/*.npy") if ("_full" in str(x))] +#print(paths_np_back) +print(len(paths_np_back)) +df_back_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_back_full_np) + +key_np_back = {} +for col in names_sig: + for rois in roiis: + key_np_back[f'{col}_{rois}'] = [] +for col in names_sig: + for rois in roiis: + for path in paths_np_back: + if f'{col}_{rois}' in path: + key_np_back[f'{col}_{rois}'].append(path) +#print(key_np_back) +for key in key_np_back.keys(): + key_np_back[key] = [np.load(element) for element in key_np_back[key]] +#print(key_np_back) + +key_np_full_back = {} +for col in names_sig: + for rois in roiis: + key_np_full_back[f'{col}_{rois}'] = np.array([]) +for key in key_np_full_back.keys(): + key_np_full_back[key] = np.concatenate(tuple(key_np_back[key]), axis = None) +#print(key_np_full_back) + +for key in key_np_full_back.keys(): + df_back_full_np[key] = pd.Series(key_np_full_back[key]) +print(df_back_full_np) +df_b_new_np = df_back_full_np[[f'{col}_{roi}' for col in names_sig]] +df_b_new_np = df_b_new_np.dropna() +print(df_b_new_np) + +len_var = [] +for col in names_sig: + len_var.append(len(df_b_new_np[f'{col}_{roi}'])) + df_b_new_np['target'] = np.zeros(np.max(len_var)) +print(df_b_new_np) + +###################################################################################### + +df = 
pd.concat([df_s_new_np, df_b_new_np], ignore_index = True)
+print(df)
+print(df.info())
+df.to_csv(net_path + f'/xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8', index=False)
+
+print("% of negative weights: " + str(len(df[f"wei_{roi}"][df[f"wei_{roi}"]<0])/len(df[f"wei_{roi}"])))
+
+time = arrow.now().format("YY_MM_DD")
+plt.style.use(hep.style.ROOT)
+names_sig_updated = ['m(H)', '$p_t$(H)', '$p_t$(Z)', 'm(Z)', '$p_t$($Z_{gen}$)', 'm($Z_{gen}$)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$',
+                     '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet',
+                     '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$',
+                     '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$']
+
+c = 0
+for col in names_sig[1:]:
+
+    plt.figure(figsize=(10,10))
+    # number of signal rows; the signal block was concatenated first in df
+    len_sig = int((df['target'] == 1).sum())
+    print(len_sig)
+    names_big_ax = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'pt_lead', 'pt_sublead']
+    if col in names_big_ax:
+        hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_{roi}'][:len_sig])).plot()
+        hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_{roi}'][len_sig:])).plot()
+    else:
+        hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_{roi}'][:len_sig])).plot()
+        hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_{roi}'][len_sig:])).plot()
+    if 'pt' in col:
+        if 'ratio' not in col:
+            plt.xlabel('$p_t$ in GeV')
+        else:
+            plt.xlabel('')
+    elif 'mass' in col:
+        plt.xlabel('Mass in GeV')
+    else:
+        plt.xlabel('')
+    plt.ylabel("Counts")
+    plt.title(f'{names_sig_updated[c]}_{roi}')
+    plt.legend(['Signal', 'Background'])
+    #plt.show()
+    plt.savefig(net_path +f"plot/{folder_save}/{col}_{roi}.jpg")
+
+
+
+    fig, ((ax), (rax)) = plt.subplots(
+        2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True
+    )
+    fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97)
+    hep.cms.label("Private work", com="13", data=True, loc=0, ax=ax)  # sqrt(s) = 13 TeV for the UL17 dataset
+    counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True)
+    counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True)
+
+    counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80)
+    counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80)
+    ## plot reference
+    hep.histplot(
+        np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig])),
+        label= 'Higgs -> cc',
+        histtype="step",
+        color='r',
+        yerr=True,
+        ax=ax,
+        density = True,
+    )
+    # annotate every 5th (background) / 6th (signal) bin with its raw, unweighted event count
+    for i in range(0, len(bins2)-1):
+        x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i]
+        y_pos_sig = counts1[i] + (counts1[i] * 0.01)
+        label_p_sig = str(counts11[i])
+        x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i]
+        y_pos = counts2[i] + (counts2[i] * 0.01)
+        label_p = str(counts22[i])
+        if i%5 == 0:
+            ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green')
+        if i%6 == 0:
+            ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red')
+    ## plot compare list
+    hep.histplot(
+        np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:])),
+        label='DY bg',
+        histtype="step",
+        
color='g', + yerr=True, + ax=ax, + density = True, + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(net_path +f"plot/{folder_save}/compare_{col}_{roi}.pdf") + fig.savefig(net_path +f"plot/{folder_save}/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### No rescaling #################################################################################### + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig])) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:])) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig])), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%5 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%6 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:])), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + + ) + # plot ratio of com/Ref + + counts1, bins1 = 
np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(net_path +f"plot/{folder_save}/compare_no_dense_{col}_{roi}.pdf") + fig.savefig(net_path +f"plot/{folder_save}/compare_no_dense_{col}_{roi}.jpg") + + ###################################################################################################### + #### No rescaling hist density ###################################################################### + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, density = True) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, density = True) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%5 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%6 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + + ) + # plot ratio of com/Ref + + counts1, bins1 = 
np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(net_path +f"plot/{folder_save}/compare_np_dense_{col}_{roi}.pdf") + fig.savefig(net_path +f"plot/{folder_save}/compare_np_dense_{col}_{roi}.jpg") + + ###################################################################################################### + #### No rescaling hist density True ################################################################# + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, density = True) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, density = True) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + density = True, + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%5 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%6 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + density = True, + ) + # plot ratio of com/Ref + + counts1, bins1 = 
np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(net_path +f"plot/{folder_save}/compare_np_dense_true_{col}_{roi}.pdf") + fig.savefig(net_path +f"plot/{folder_save}/compare_np_dense_true_{col}_{roi}.jpg") + + c += 1 + +X = df.drop("target", axis = 1) +print(X) +X = X.drop(f"wei_{roi}", axis = 1) +X = X.drop(f"Z_mass_{roi}", axis = 1) +X = X.drop(f"Z_pt_gen_{roi}", axis = 1) +X = X.drop(f"Z_mass_gen_{roi}", axis = 1) +print(X) +print(X.info()) + +y = df["target"] +print(y) + + + + +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder + +categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + +from sklearn.preprocessing import StandardScaler +numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + +cat_cols = X.select_dtypes(exclude = "number").columns +num_cols = X.select_dtypes(include = "number").columns + +print(cat_cols) +print(num_cols) + +from sklearn.compose import ColumnTransformer + +full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + +import xgboost as xgb + +X_processed = full_processor.fit_transform(X) +y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state = 1121218) + +from sklearn.metrics import accuracy_score + +### Init classifier +xgb_cl = xgb.XGBClassifier(booster = 'gbtree', base_score = 0.5, learning_rate = 0.01, gamma = 1, reg_alpha = 0.2, reg_lambda = 0.2, n_estimators = 1000, max_depth = 3, subsample = 0.8) + +### Fit +xgb_cl.fit(X_train, y_train) + +print(xgb_cl) +### Predict +preds = xgb_cl.predict(X_test) + +print(accuracy_score(y_test, preds)) + +from xgboost import plot_importance +from xgboost import plot_tree + +importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_}) +importances = importances.sort_values(by = 
"Importance", ascending = False) +importances = importances.set_index('Feature') +print(importances) +importances.plot.bar() + +fig, ax = plt.subplots(figsize=(17,12)) +plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax) +plt.xlabel('Feature scores') +plt.ylabel("Feature names") +plt.title('Importance plot') +plt.legend(['']) +#plt.show() +plt.savefig(net_path + f"plot/{folder_save}/importance.jpg") + +plt.figure(figsize=(17,12)) +plot_tree(xgb_cl, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(net_path + f"plot/{folder_save}/boost_tree.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show() diff --git a/xgb_test_only_xgb.py b/xgb_test_only_xgb.py new file mode 100644 index 0000000..7d4307a --- /dev/null +++ b/xgb_test_only_xgb.py @@ -0,0 +1,361 @@ +from coffea.util import load +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt, mplhep as hep +import hist +import argparse, sys, os, arrow, glob, yaml +from matplotlib.offsetbox import AnchoredText +import xgboost as xgb +from hyperopt import STATUS_OK, Trials, fmin, hp, tpe +from sklearn.metrics import accuracy_score +from tqdm.notebook import tqdm +from sklearn.metrics import roc_auc_score, roc_curve +from sklearn.model_selection import RepeatedKFold +import json + +####################################################################################### +## Create the folder to save the data if it doesn't exist and read in the dataframe ### +####################################################################################### +net_path = "/net/scratch_cms3a/vaulin/" +folder_save = 'eval_23_04_11' +if not os.path.exists(f"./plot/{folder_save}"): + os.mkdir(f"./plot/{folder_save}") +df = pd.read_csv('xgb_training_dataset_low_ee.csv') + +time = arrow.now().format("YY_MM_DD") +plt.style.use(hep.style.ROOT) + + +######################################################################################## +########## drop target from df and bring it to a separate column, drop weights ######### +######################################################################################## +X = df.drop("target", axis = 1) +print(X) +X = X.drop("wei_low_ee", axis = 1) +print(X) +print(X.info()) + +y = df["target"] +print(y) + + +######################################################################################## +################# GRID search attempt ################################################## +######################################################################################## +''' +from sklearn.model_selection import GridSearchCV + +### Creat the parameter grid +gbm_param_grid = {'max_depth' : [3, 4, 5, 6, 7, 8, 9], 'min_child_weight' : [1], 'gamma' : [0], 'subsample' : [0.8], 'colsample_bytree' : [0.8], 'reg_alpha' : [0.005], 'n_estimators': [1000]} + +gbm = xgb.XGBRegressor() + +grid_mse = GridSearchCV(param_grid = gbm_param_grid, estimator = gbm, scoring = 'neg_mean_squared_error', cv = 4, verbose = 1) + +grid_mse.fit(X,y) + + +print("Best parameters found: ", grid_mse.best_params_) +print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_))) +''' + +######################################################################################## +############# An attempt to do hyperparameter tuning for the classifier fit ############ +######################################################################################## +space = {"max_depth": hp.quniform("max_depth", 3, 18, 1), + "gamma": hp.uniform("gamma", 1, 9), + "reg_alpha": 
hp.quniform("reg_alpha", 40, 180, 1), + "reg_lambda": hp.uniform("reg_lambda", 0, 1), + "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1), + "min_child_weight": hp.quniform("min_child_weight", 0, 10, 1), + "n_estimators": 200, + "learning_rate": hp.uniform("learning_rate", 0.001, 0.1), + "subsample": hp.uniform("subsample", 0.8, 1), + "seed":0} + +#learning_rate = space['learning_rate'], + +def objective(space): + clf = xgb.XGBClassifier( n_estimators = int(space['n_estimators']), max_depth = int(space['max_depth']), gamma = space['gamma'], reg_alpha = int(space['reg_alpha']), min_child_weight = int(space['min_child_weight']), colsample_bytree = int(space['colsample_bytree']), eval_metric = 'auc', early_stopping_rounds = 10) + evaluation = [(X_train, y_train), (X_test, y_test)] + + clf.fit(X_train, y_train, eval_set = evaluation, verbose = False) + pred = clf.predict(X_test) + accuracy = accuracy_score(y_test, pred>0.5) + print("SCORE: ", accuracy) + return {'loss': -accuracy, 'status': STATUS_OK} + +######################################################################################### +############# Create pipelines for xgb training ######################################### +######################################################################################### +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder + +categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + +from sklearn.preprocessing import StandardScaler +numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + +cat_cols = X.select_dtypes(exclude = "number").columns +num_cols = X.select_dtypes(include = "number").columns + +print(cat_cols) +print(num_cols) + +from sklearn.compose import ColumnTransformer + +full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + + + +X_processed = full_processor.fit_transform(X) +y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + +######################################################################################### +############ split dataset into training and test ####################################### +######################################################################################### +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state = 1121218) +#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 101, stratify = y) +print(X_train) +print(X_test) +print(y_train) + +############################################################################################################ +######### preparing the XGB classifiers in 20 x 5-folds cross validation using repeated k-fold ############# +############################################################################################################ +cv = RepeatedKFold(n_splits = 5, n_repeats = 20, random_state = 101) +folds = [(train, test) for train, test in cv.split(X_train, y_train)] +#print(folds) +metrics = ['auc', 'fpr', 'tpr', 'thresholds'] +results = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +eta = 0.4 +params = 
{'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': eta} +with open(f"plot/{folder_save}/results_first.json", 'w') as outfile: + json.dump(results, outfile) + + + +dtest = xgb.DMatrix(X_test, label = y_test) +#print(dtest) +for train, test in tqdm(folds, total = len(folds)): + print('train') + dtrain = xgb.DMatrix(X_train[train,:], + label = y_train[train]) + dval = xgb.DMatrix(X_train[test, :], label = y_train[test]) + model = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 10, num_boost_round = 200) #num_boost_round = 1000, 200 is optimal + sets = [dtrain, dval, dtest] + for i, ds in enumerate(results.keys()): + print(i) + y_preds = model.predict(sets[i]) + labels = sets[i].get_label() + fpr, tpr, thresholds = roc_curve(labels, y_preds) + results[ds]['fpr'].append(fpr) + results[ds]['tpr'].append(tpr) + results[ds]['thresholds'].append(thresholds) + results[ds]['auc'].append(roc_auc_score(labels, y_preds)) + +def convert(x): + if hasattr(x, "tolist"): + return x.tolist() + raise TypeError(x) + +with open(f"plot/{folder_save}/results_lr_{eta}.json", 'w') as outfile: + #json.dump(results, outfile, indent = 4) + str_j = json.dumps(results, indent = 4, sort_keys = True, default=convert) + outfile.write(str_j) + +########################################################################################################## +############## plotting the ROC curves with uncertainties ################################################ +########################################################################################################## +kind = 'val' + +c_fill = 'rgba(52, 152, 219, 0.2)' +c_line = 'rgba(52, 152, 219, 0.5)' +c_line_main = 'rgba(41, 128, 185, 1.0)' +c_grid = 'rgba(189, 195, 199, 0.5)' +c_annot = 'rgba(149, 165, 166, 0.5)' +c_highlight = 'rgba(192, 57, 43, 1.0)' + +fpr_mean = np.linspace(0, 1, 100) + +interp_tprs = [] +for i in range(100): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) +tpr_mean = np.mean(interp_tprs, axis = 0) +tpr_mean[-1] = 1.0 +tpr_std = 2*np.std(interp_tprs, axis = 0) +tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) +tpr_lower = tpr_mean - tpr_std +auc = np.mean(results[kind]['auc']) + +import plotly.graph_objects as go + +fig = go.Figure([go.Scatter(x = tpr_upper, y = fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = tpr_mean, y = fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.3f}')]) + +fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 0, y1 = 1) +fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'FPR (Background rejection)', yaxis_title = 'TPR (signal efficiency)', width = 1600, height = 900, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) +fig.update_yaxes(range = [0,1], gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') +fig.update_xaxes(range = [0,1], gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + +fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_eff.jpg") 
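+# The band above is the fold-averaged ROC with a +-2*sigma envelope (tpr_std = 2*std);
+# note that fig.write_image() relies on the kaleido package (or a legacy orca install)
+# being available to rasterise plotly figures.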
+fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_eff.pdf") + +''' +fig = go.Figure([go.Scatter(x = 1 - fpr_mean, y = tpr_upper, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = 1 - fpr_mean, y = tpr_lower, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = 1 - fpr_mean, y = tpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.3f}')]) + +fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 0, y1 = 1) +fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = '1 - FPR (Background rejection)', yaxis_title = 'TPR (signal efficiency)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) +fig.update_yaxes(range = [0,1], gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') +fig.update_xaxes(range = [0,1], gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + +fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_rej.jpg") +fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_rej.pdf") +''' +################################################################################################## +########## Actual hyperparameter tuning ########################################################## +################################################################################################## + +trials = Trials() + +#best_hyperparams = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 100, trials = trials) +#print("The best hyperparameters are: ", "\n") +#print(best_hyperparams) + + + + + + + + + + + + + + + + +from sklearn.metrics import accuracy_score + +### Init classifier +#xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = best_hyperparams['learning_rate'], gamma = best_hyperparams['gamma'], reg_alpha = best_hyperparams['reg_alpha'], reg_lambda = best_hyperparams['reg_lambda'], n_estimators = 200, max_depth = int(best_hyperparams['max_depth']), subsample = best_hyperparams['subsample'], min_child_weight = best_hyperparams['min_child_weight'], colsample_bytree = best_hyperparams['colsample_bytree']) +xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = 0.0292, gamma = 1.087, reg_alpha = 42.0, reg_lambda = 0.381, n_estimators = 200, max_depth = 8, subsample = 0.841, min_child_weight = 2.0, colsample_bytree = 0.994) + + +### Fit +dtest = xgb.DMatrix(X_test, label = y_test) +#print(dtest) +dtrain = xgb.DMatrix(X_train[:int(len(X_train)*0.8), :], label = y_train[:int(len(y_train)*0.8)]) +dval = xgb.DMatrix(X_train[int(len(X_train)*0.8):, :], label = y_train[int(len(y_train)*0.8):]) +model_xgb = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 200) #num_boost_round = 1000, +sets = [dtrain, dval, dtest] +results_new = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +params_new = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss'} + +for i, ds in enumerate(results_new.keys()): + print(i) + y_preds_new = model_xgb.predict(sets[i]) + labels_new = sets[i].get_label() + fpr_new, tpr_new, thresholds_new = roc_curve(labels_new, y_preds_new) + results_new[ds]['fpr'].append(fpr_new) + results_new[ds]['tpr'].append(tpr_new) + 
+
+xgb_cl.fit(X_train, y_train)
+
+print(xgb_cl)
+### Predict
+preds = xgb_cl.predict(X_test)
+
+print(accuracy_score(y_test, preds))
+
+print(y_test)
+print(model_xgb.predict(dtest))
+print(np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)]))
+predict_train = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)])
+
+print(accuracy_score(y_test, predict_train))
+
+from xgboost import plot_importance
+from xgboost import plot_tree, to_graphviz
+
+importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_})
+importances = importances.sort_values(by = "Importance", ascending = False)
+importances = importances.set_index('Feature')
+print(importances)
+importances.plot.bar()
+
+fig, ax = plt.subplots(figsize=(17,12))
+plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax)
+plt.xlabel('Feature scores')
+plt.ylabel("Feature names")
+plt.title('Importance plot')
+plt.legend([''])
+#plt.show()
+plt.savefig(f"plot/{folder_save}/importance.jpg")
+
+# importance from the final trained booster, as in the reloaded scripts
+feature_importance = model_xgb.get_score(importance_type = 'weight')
+keys = list(feature_importance.keys())
+names_sig = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'jjVptratio', 'CvsL_max',
+             'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead',
+             'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_ll', 'del_eta_ll',
+             'del_phi_l2_subleading', 'del_phi_l2_leading']
+values = list(feature_importance.values())
+data = pd.DataFrame(data = values, index = names_sig, columns = ['score']).sort_values(by = 'score', ascending = False)
+print(data)
+print(data.index)
+
+fig = plt.figure(figsize=(17,12))
+ax1 = fig.add_subplot(1,2,1)
+ax1.set_axis_off()
+ax2 = fig.add_subplot(1,2,2)
+ax2.barh(list(reversed(data.index)), list(reversed(data.score)))
+ax2.set_xlabel('Feature scores')
+ax2.set_ylabel("Feature names")
+ax2.set_title('Importance plot')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/importance_train.jpg")
+
+plt.figure(figsize=(17,12))
+plot_tree(xgb_cl, fmap = 'feature_map.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/boost_tree.jpg", dpi = 1800)
+###result = 1/(1 + np.exp(-leaf_value)) gives the probability of class 1
+#plt.show()'''
+
+plt.figure(figsize=(17,12))
+plot_tree(model_xgb, fmap = 'feature_map.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/boost_tree_train.jpg", dpi = 1800)
+###result = 1/(1 + np.exp(-leaf_value)) gives the probability of class 1
+#plt.show()'''
+'''
+plt.figure(figsize=(17,12))
+to_graphviz(model_xgb, fmap = 'feature_map.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/boost_tree_train_graphviz.jpg", dpi = 1800)
+###result = 1/(1 + np.exp(-leaf_value)) gives the probability of class 1
+#plt.show()'''
diff --git a/xgb_test_only_xgb_no_coffea.py b/xgb_test_only_xgb_no_coffea.py
new file mode 100644
index 0000000..dd3b1d1
--- /dev/null
+++ b/xgb_test_only_xgb_no_coffea.py
@@ -0,0 +1,399 @@
+from coffea.util import load
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt, mplhep as hep
+import hist
+import argparse, sys, os, arrow, glob, yaml
+from matplotlib.offsetbox import AnchoredText
+import xgboost as xgb
+from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
+from sklearn.metrics import accuracy_score
+from tqdm.notebook import tqdm
+from sklearn.metrics import roc_auc_score, roc_curve
+from sklearn.model_selection import RepeatedKFold
+import json
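+# Directory-creation sketch: os.mkdir fails when intermediate directories are
+# missing, while os.makedirs with exist_ok=True covers both the existing and
+# the nested case (assumption: the nested "plot/<folder_save>" layout used
+# below is the intended one):
+#
+#   os.makedirs(f"./plot/{folder_save}", exist_ok = True)
+#   os.makedirs(net_path + f"plot/{folder_save}", exist_ok = True)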
+
+#######################################################################################
+## Create the folder to save the data if it doesn't exist and read in the dataframe ###
+#######################################################################################
+net_path = "/net/scratch_cms3a/vaulin/"
+folder_save = 'eval_23_04_11'
+roi = 'low_mumu'
+if not os.path.exists(f"./plot/{folder_save}"):
+    os.mkdir(f"./plot/{folder_save}")
+if not os.path.exists(net_path + f"plot/{folder_save}"):
+    os.mkdir(net_path + f"plot/{folder_save}")
+df = pd.read_csv(net_path + f'xgb_training_dataset_{roi}.csv')
+
+time = arrow.now().format("YY_MM_DD")
+plt.style.use(hep.style.ROOT)
+
+########################################################################################
+########## drop target from df and bring it to a separate column, drop weights #########
+########################################################################################
+X = df.drop("target", axis = 1)
+print(X)
+X = X.drop(f"wei_{roi}", axis = 1)
+X = X.drop(f"Z_mass_{roi}", axis = 1)
+X = X.drop(f"Z_pt_gen_{roi}", axis = 1)
+X = X.drop(f"Z_mass_gen_{roi}", axis = 1)
+print(X)
+print(X.info())
+
+y = df["target"]
+print(y)
+
+########################################################################################
+################# GRID search attempt ##################################################
+########################################################################################
+'''
+from sklearn.model_selection import GridSearchCV
+
+### Create the parameter grid
+gbm_param_grid = {'max_depth' : [3, 4, 5, 6, 7, 8, 9], 'min_child_weight' : [1], 'gamma' : [0], 'subsample' : [0.8], 'colsample_bytree' : [0.8], 'reg_alpha' : [0.005], 'n_estimators': [1000]}
+
+gbm = xgb.XGBRegressor()
+
+grid_mse = GridSearchCV(param_grid = gbm_param_grid, estimator = gbm, scoring = 'neg_mean_squared_error', cv = 4, verbose = 1)
+
+grid_mse.fit(X,y)
+
+print("Best parameters found: ", grid_mse.best_params_)
+print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_)))
+'''
+
+########################################################################################
+############# An attempt to do hyperparameter tuning for the classifier fit ############
+########################################################################################
+space = {"max_depth": hp.quniform("max_depth", 3, 18, 1),
+         "gamma": hp.uniform("gamma", 1, 9),
+         "reg_alpha": hp.quniform("reg_alpha", 40, 180, 1),
+         "reg_lambda": hp.uniform("reg_lambda", 0, 1),
+         "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1),
+         "min_child_weight": hp.quniform("min_child_weight", 0, 10, 1),
+         "n_estimators": 200,
+         "learning_rate": hp.uniform("learning_rate", 0.001, 0.1),
+         "subsample": hp.uniform("subsample", 0.8, 1),
+         "seed": 0}
+
+#learning_rate = space['learning_rate'],
+
+def objective(space):
+    # colsample_bytree is a fraction in (0.5, 1] and must stay a float;
+    # an int() cast would truncate it to 0
+    clf = xgb.XGBClassifier(n_estimators = int(space['n_estimators']), max_depth = int(space['max_depth']), gamma = space['gamma'], reg_alpha = int(space['reg_alpha']), min_child_weight = int(space['min_child_weight']), colsample_bytree = space['colsample_bytree'], eval_metric = 'auc', early_stopping_rounds = 10)
+    evaluation = [(X_train, y_train), (X_test, y_test)]
+
+    clf.fit(X_train, y_train, eval_set = evaluation, verbose = False)
+    pred = clf.predict(X_test)
+    accuracy = accuracy_score(y_test, pred > 0.5)
+    print("SCORE: ", accuracy)
+    return {'loss': 
-accuracy, 'status': STATUS_OK} + +######################################################################################### +############# Create pipelines for xgb training ######################################### +######################################################################################### +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder + +categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + +from sklearn.preprocessing import StandardScaler +numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + +cat_cols = X.select_dtypes(exclude = "number").columns +num_cols = X.select_dtypes(include = "number").columns + +print(cat_cols) +print(num_cols) + +from sklearn.compose import ColumnTransformer + +full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + + + +X_processed = full_processor.fit_transform(X) +y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + +######################################################################################### +############ split dataset into training and test ####################################### +######################################################################################### +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state = 1121218) +#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 101, stratify = y) +print(X_train) +print(X_test) +print(y_train) + +############################################################################################################ +######### preparing the XGB classifiers in 20 x 5-folds cross validation using repeated k-fold ############# +############################################################################################################ +cv = RepeatedKFold(n_splits = 5, n_repeats = 20, random_state = 101) +folds = [(train, test) for train, test in cv.split(X_train, y_train)] +#print(folds) +metrics = ['auc', 'fpr', 'tpr', 'thresholds'] +results = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +results_zero_train = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +results_weak_train = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +eta = 0.3 +params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': eta} +with open(net_path + f"plot/{folder_save}/results_first.json", 'w') as outfile: + json.dump(results, outfile) + + + +dtest = xgb.DMatrix(X_test, label = y_test) +#print(dtest) +for train, test in tqdm(folds, total = len(folds)): + print('train') + dtrain = xgb.DMatrix(X_train[train,:], + label = y_train[train]) + dval = xgb.DMatrix(X_train[test, :], label = y_train[test]) + model = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 10, num_boost_round = 200) #num_boost_round = 1000, 200 is optimal + model_zero_train = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 
'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 10, num_boost_round = 0) #num_boost_round = 1000, 200 is optimal + model_weak_train = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 10, num_boost_round = 20) #num_boost_round = 1000, 200 is optimal + sets = [dtrain, dval, dtest] + for i, ds in enumerate(results.keys()): + print(i) + y_preds = model.predict(sets[i]) + y_preds_zero_train = model_zero_train.predict(sets[i]) + y_preds_weak_train = model_weak_train.predict(sets[i]) + labels = sets[i].get_label() + fpr, tpr, thresholds = roc_curve(labels, y_preds) + fpr_zero, tpr_zero, thresholds_zero = roc_curve(labels, y_preds_zero_train) + fpr_weak, tpr_weak, thresholds_weak = roc_curve(labels, y_preds_weak_train) + results[ds]['fpr'].append(fpr) + results[ds]['tpr'].append(tpr) + results[ds]['thresholds'].append(thresholds) + results[ds]['auc'].append(roc_auc_score(labels, y_preds)) + results_zero_train[ds]['fpr'].append(fpr_zero) + results_zero_train[ds]['tpr'].append(tpr_zero) + results_zero_train[ds]['thresholds'].append(thresholds_zero) + results_zero_train[ds]['auc'].append(roc_auc_score(labels, y_preds_zero_train)) + results_weak_train[ds]['fpr'].append(fpr_weak) + results_weak_train[ds]['tpr'].append(tpr_weak) + results_weak_train[ds]['thresholds'].append(thresholds_weak) + results_weak_train[ds]['auc'].append(roc_auc_score(labels, y_preds_weak_train)) + +def convert(x): + if hasattr(x, "tolist"): + return x.tolist() + raise TypeError(x) + +with open(net_path + f"plot/{folder_save}/results_lr_{eta}.json", 'w') as outfile: + #json.dump(results, outfile, indent = 4) + str_j = json.dumps(results, indent = 4, sort_keys = True, default=convert) + outfile.write(str_j) + +with open(net_path + f"plot/{folder_save}/results_zero_train_lr_{eta}.json", 'w') as outfile: + #json.dump(results, outfile, indent = 4) + str_j = json.dumps(results_zero_train, indent = 4, sort_keys = True, default=convert) + outfile.write(str_j) + +with open(net_path + f"plot/{folder_save}/results_weak_train_lr_{eta}.json", 'w') as outfile: + #json.dump(results, outfile, indent = 4) + str_j = json.dumps(results_weak_train, indent = 4, sort_keys = True, default=convert) + outfile.write(str_j) + +########################################################################################################## +############## plotting the ROC curves with uncertainties ################################################ +########################################################################################################## +kind = 'val' + +c_fill = 'rgba(52, 152, 219, 0.2)' +c_line = 'rgba(52, 152, 219, 0.5)' +c_line_main = 'rgba(41, 128, 185, 1.0)' +c_grid = 'rgba(189, 195, 199, 0.5)' +c_annot = 'rgba(149, 165, 166, 0.5)' +c_highlight = 'rgba(192, 57, 43, 1.0)' + +fpr_mean = np.linspace(0, 1, 100) + +interp_tprs = [] +for i in range(100): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) +tpr_mean = np.mean(interp_tprs, axis = 0) +tpr_mean[-1] = 1.0 +tpr_std = 2*np.std(interp_tprs, axis = 0) +tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) +tpr_lower = tpr_mean - tpr_std +auc = np.mean(results[kind]['auc']) + +import plotly.graph_objects as go + +fig = go.Figure([go.Scatter(x = tpr_upper, y = fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = 
tpr_lower, y = fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'),
+                 go.Scatter(x = tpr_mean, y = fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.3f}')])
+
+fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 0, y1 = 1)
+# x carries the TPR band and y the FPR grid, so the axis labels follow the data
+fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = 'FPR (background efficiency)', width = 1600, height = 900, legend = dict(yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,))
+fig.update_yaxes(range = [0,1], gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black')
+fig.update_xaxes(range = [0,1], gridcolor = c_grid, constrain = 'domain', linecolor = 'black')
+
+fig.write_image(net_path + f"plot/{folder_save}/plotly_ROC_bg_eff.jpg")
+fig.write_image(net_path + f"plot/{folder_save}/plotly_ROC_bg_eff.pdf")
+
+'''
+fig = go.Figure([go.Scatter(x = 1 - fpr_mean, y = tpr_upper, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'),
+                 go.Scatter(x = 1 - fpr_mean, y = tpr_lower, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'),
+                 go.Scatter(x = 1 - fpr_mean, y = tpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.3f}')])
+
+fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 0, y1 = 1)
+fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = '1 - FPR (Background rejection)', yaxis_title = 'TPR (signal efficiency)', width = 800, height = 800, legend = dict(yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,))
+fig.update_yaxes(range = [0,1], gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black')
+fig.update_xaxes(range = [0,1], gridcolor = c_grid, constrain = 'domain', linecolor = 'black')
+
+fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_rej.jpg")
+fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_rej.pdf")
+'''
+##################################################################################################
+########## Actual hyperparameter tuning ##########################################################
+##################################################################################################
+
+trials = Trials()
+
+#best_hyperparams = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 100, trials = trials)
+#print("The best hyperparameters are: ", "\n")
+#print(best_hyperparams)
+
+from sklearn.metrics import accuracy_score
+
+### Init classifier
+#xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = best_hyperparams['learning_rate'], gamma = best_hyperparams['gamma'], reg_alpha = best_hyperparams['reg_alpha'], reg_lambda = best_hyperparams['reg_lambda'], n_estimators = 200, max_depth = int(best_hyperparams['max_depth']), subsample = best_hyperparams['subsample'], min_child_weight = best_hyperparams['min_child_weight'], colsample_bytree = best_hyperparams['colsample_bytree'])
+xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = 0.0292, gamma = 1.087, reg_alpha = 42.0, reg_lambda = 0.381, n_estimators = 200, max_depth = 8, subsample = 0.841, min_child_weight = 2.0, colsample_bytree = 0.994)
+
+### Fit
+dtest = xgb.DMatrix(X_test, label = y_test)
+#print(dtest)
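+# The 80/20 dtrain/dval split below slices sequentially; after the stratified,
+# shuffled train_test_split above, the row order carries no structure, so this
+# is effectively random. A sketch of an explicit alternative (names X_tr etc.
+# are illustrative, not part of this script):
+#
+#   from sklearn.model_selection import train_test_split
+#   X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train,
+#                                               test_size = 0.2,
+#                                               random_state = 101)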
+dtrain = xgb.DMatrix(X_train[:int(len(X_train)*0.8), :], label = y_train[:int(len(y_train)*0.8)])
+dval = xgb.DMatrix(X_train[int(len(X_train)*0.8):, :], label = y_train[int(len(y_train)*0.8):])
+model_xgb = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'), (dval, 'dval')],
+                      verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 200) #num_boost_round = 1000,
+sets = [dtrain, dval, dtest]
+results_new = {'train': {m: [] for m in metrics},
+               'val': {m: [] for m in metrics},
+               'test': {m: [] for m in metrics}}
+params_new = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss'}
+
+for i, ds in enumerate(results_new.keys()):
+    print(i)
+    y_preds_new = model_xgb.predict(sets[i])
+    labels_new = sets[i].get_label()
+    fpr_new, tpr_new, thresholds_new = roc_curve(labels_new, y_preds_new)
+    results_new[ds]['fpr'].append(fpr_new)
+    results_new[ds]['tpr'].append(tpr_new)
+    results_new[ds]['thresholds'].append(thresholds_new)
+    results_new[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new))
+
+xgb_cl.fit(X_train, y_train)
+
+print(xgb_cl)
+### Predict
+preds = xgb_cl.predict(X_test)
+
+print(accuracy_score(y_test, preds))
+
+print(y_test)
+print(model_xgb.predict(dtest))
+print(np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)]))
+predict_train = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)])
+
+print(accuracy_score(y_test, predict_train))
+
+from xgboost import plot_importance
+from xgboost import plot_tree, to_graphviz
+
+importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_})
+importances = importances.sort_values(by = "Importance", ascending = False)
+importances = importances.set_index('Feature')
+print(importances)
+importances.plot.bar()
+
+fig, ax = plt.subplots(figsize=(17,12))
+plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax)
+plt.xlabel('Feature scores')
+plt.ylabel("Feature names")
+plt.title('Importance plot')
+plt.legend([''])
+#plt.show()
+plt.savefig(net_path + f"plot/{folder_save}/importance.jpg")
+
+# importance from the final trained booster, as in the reloaded scripts
+feature_importance = model_xgb.get_score(importance_type = 'weight')
+keys = list(feature_importance.keys())
+names_sig = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'jjVptratio', 'CvsL_max',
+             'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead',
+             'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll',
+             'del_phi_l2_subleading', 'del_phi_l2_leading']
+values = list(feature_importance.values())
+data = pd.DataFrame(data = values, index = names_sig, columns = ['score']).sort_values(by = 'score', ascending = False)
+print(data)
+print(data.index)
+
+fig = plt.figure(figsize=(17,12))
+ax1 = fig.add_subplot(1,2,1)
+ax1.set_axis_off()
+ax2 = fig.add_subplot(1,2,2)
+ax2.barh(list(reversed(data.index)), list(reversed(data.score)))
+ax2.set_xlabel('Feature scores')
+ax2.set_ylabel("Feature names")
+ax2.set_title('Importance plot')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/importance_train.jpg")
+
+plt.figure(figsize=(17,12))
+plot_tree(xgb_cl, fmap = 'feature_map.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(net_path + f"plot/{folder_save}/boost_tree.jpg", dpi = 1800)
+###result = 1/(1 + np.exp(-leaf_value)) gives the probability of class 1
+#plt.show()'''
+
+plt.figure(figsize=(17,12))
+plot_tree(model_xgb, fmap = 'feature_map.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(net_path + f"plot/{folder_save}/boost_tree_train.jpg", dpi = 1800)
+###result = 1/(1 + np.exp(-leaf_value)) gives the probability of class 1
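+# get_score(importance_type = 'weight') counts how often a feature is used to
+# split; xgboost's Booster.get_score also accepts other standard views. A
+# short comparison sketch:
+#
+#   for itype in ('weight', 'gain', 'total_gain', 'cover'):
+#       print(itype, model_xgb.get_score(importance_type = itype))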
+#plt.show()'''
+'''
+plt.figure(figsize=(17,12))
+to_graphviz(model_xgb, fmap = 'feature_map.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/boost_tree_train_graphviz.jpg", dpi = 1800)
+###result = 1/(1 + np.exp(-leaf_value)) gives the probability of class 1
+#plt.show()'''
diff --git a/xgb_test_only_xgb_reloaded.py b/xgb_test_only_xgb_reloaded.py
new file mode 100644
index 0000000..9f3d72d
--- /dev/null
+++ b/xgb_test_only_xgb_reloaded.py
@@ -0,0 +1,294 @@
+from coffea.util import load
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt, mplhep as hep
+import hist
+import argparse, sys, os, arrow, glob, yaml
+from matplotlib.offsetbox import AnchoredText
+import xgboost as xgb
+from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
+from sklearn.metrics import accuracy_score
+from tqdm.notebook import tqdm
+from sklearn.metrics import roc_auc_score, roc_curve
+from sklearn.model_selection import RepeatedKFold
+import json
+
+folder_save = 'eval_23_03_07_1'
+if not os.path.exists(f"./plot/{folder_save}"):
+    os.mkdir(f"./plot/{folder_save}")
+df = pd.read_csv('xgb_training_dataset_low_ee.csv')
+
+learning_rate = 0.3
+
+time = arrow.now().format("YY_MM_DD")
+plt.style.use(hep.style.ROOT)
+
+X = df.drop("target", axis = 1)
+print(X)
+X = X.drop("wei_low_ee", axis = 1)
+print(X)
+print(X.info())
+
+y = df["target"]
+print(y)
+
+space = {"max_depth": hp.quniform("max_depth", 3, 18, 1),
+         "gamma": hp.uniform("gamma", 1, 9),
+         "reg_alpha": hp.quniform("reg_alpha", 40, 180, 1),
+         "reg_lambda": hp.uniform("reg_lambda", 0, 1),
+         "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1),
+         "min_child_weight": hp.quniform("min_child_weight", 0, 10, 1),
+         "n_estimators": 200,
+         "learning_rate": hp.uniform("learning_rate", 0.001, 0.1),
+         "subsample": hp.uniform("subsample", 0.8, 1),
+         "seed": 0}
+
+#learning_rate = space['learning_rate'],
+
+def objective(space):
+    # colsample_bytree is a fraction in (0.5, 1] and must stay a float;
+    # an int() cast would truncate it to 0
+    clf = xgb.XGBClassifier(n_estimators = int(space['n_estimators']), max_depth = int(space['max_depth']), gamma = space['gamma'], reg_alpha = int(space['reg_alpha']), min_child_weight = int(space['min_child_weight']), colsample_bytree = space['colsample_bytree'], eval_metric = 'auc', early_stopping_rounds = 10)
+    evaluation = [(X_train, y_train), (X_test, y_test)]
+
+    clf.fit(X_train, y_train, eval_set = evaluation, verbose = False)
+    pred = clf.predict(X_test)
+    accuracy = accuracy_score(y_test, pred > 0.5)
+    print("SCORE: ", accuracy)
+    return {'loss': -accuracy, 'status': STATUS_OK}
+
+from sklearn.impute import SimpleImputer
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import OneHotEncoder
+
+categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),])
+
+from sklearn.preprocessing import StandardScaler
+numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())])
+
+cat_cols = X.select_dtypes(exclude = "number").columns
+num_cols = X.select_dtypes(include = "number").columns
+
+print(cat_cols)
+print(num_cols)
+
+from sklearn.compose import ColumnTransformer
+
+full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),])
+
+X_processed = full_processor.fit_transform(X)
+y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1))
+
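+# train_test_split below defaults to a 75/25 split; stratify = y_processed
+# keeps the signal/background ratio identical in both parts. A quick
+# class-balance check sketch (np is imported above):
+#
+#   vals, counts = np.unique(y_processed, return_counts = True)
+#   print(dict(zip(vals.tolist(), counts.tolist())))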
+from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state = 1121218) +#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 101, stratify = y) +print(X_train) +print(X_test) +print(y_train) + +with open(f"plot/{folder_save}/results_lr_{learning_rate}.json") as user_file: + file_contents = user_file.read() + +results = json.loads(file_contents) +params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} +metrics = ['auc', 'fpr', 'tpr', 'thresholds'] +def convert(x): + if hasattr(x, "tolist"): + return x.tolist() + raise TypeError(x) + + +#kind = 'val' +kind = 'test' +#kind = 'train' + +c_fill = 'rgba(52, 152, 219, 0.2)' +c_line = 'rgba(52, 152, 219, 0.5)' +c_line_main = 'rgba(41, 128, 185, 1.0)' +c_grid = 'rgba(189, 195, 199, 0.5)' +c_annot = 'rgba(149, 165, 166, 0.5)' +c_highlight = 'rgba(192, 57, 43, 1.0)' + +fpr_mean = np.linspace(0, 1, 100) + +interp_tprs = [] +for i in range(100): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) +tpr_mean = np.mean(interp_tprs, axis = 0) +tpr_mean[-1] = 1.0 +tpr_std = 2*np.std(interp_tprs, axis = 0) +tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) +tpr_lower = tpr_mean - tpr_std +auc = np.mean(results[kind]['auc']) + +range_plot_x = [0,1] +range_plot_y = [0.2,1] + +import plotly.graph_objects as go + +fig = go.Figure([go.Scatter(x = tpr_upper, y = fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = tpr_mean, y = fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.3f}')]) + +fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) +fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = 'FPR (Background efficiency)', width = 1600, height = 900, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) +fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') +fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + +fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_eff_reloaded__lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}.jpg") +fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_eff_reloaded__lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}.pdf") + + +fig = go.Figure([go.Scatter(x = tpr_upper, y = 1 - fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = 1 - fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = tpr_mean, y = 1 - fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.3f}')]) + +fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) +fig.update_layout(template = 'plotly_white', title_x = 0.5, 
xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) +fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') +fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + +fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_rej_reloaded__lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}.jpg") +fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_rej_reloaded__lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}.pdf") + + + + +trials = Trials() + +#best_hyperparams = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 100, trials = trials) +#print("The best hyperparameters are: ", "\n") +#print(best_hyperparams) + + + + + + + + + + + + + + + + +from sklearn.metrics import accuracy_score + +### Init classifier +#xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = best_hyperparams['learning_rate'], gamma = best_hyperparams['gamma'], reg_alpha = best_hyperparams['reg_alpha'], reg_lambda = best_hyperparams['reg_lambda'], n_estimators = 200, max_depth = int(best_hyperparams['max_depth']), subsample = best_hyperparams['subsample'], min_child_weight = best_hyperparams['min_child_weight'], colsample_bytree = best_hyperparams['colsample_bytree']) +xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = 0.0292, gamma = 1.087, reg_alpha = 42.0, reg_lambda = 0.381, n_estimators = 200, max_depth = 8, subsample = 0.841, min_child_weight = 2.0, colsample_bytree = 0.994) + +### Fit +dtest = xgb.DMatrix(X_test, label = y_test) +#print(dtest) +dtrain = xgb.DMatrix(X_train[:int(len(X_train)*0.8), :], label = y_train[:int(len(y_train)*0.8)]) +dval = xgb.DMatrix(X_train[int(len(X_train)*0.8):, :], label = y_train[int(len(y_train)*0.8):]) +model_xgb = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 200) #num_boost_round = 1000, +sets = [dtrain, dval, dtest] +results_new = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +params_new = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss'} + +for i, ds in enumerate(results_new.keys()): + print(i) + y_preds_new = model_xgb.predict(sets[i]) + labels_new = sets[i].get_label() + fpr_new, tpr_new, thresholds_new = roc_curve(labels_new, y_preds_new) + results_new[ds]['fpr'].append(fpr_new) + results_new[ds]['tpr'].append(tpr_new) + results_new[ds]['thresholds'].append(thresholds_new) + results_new[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new)) + +xgb_cl.fit(X_train, y_train) + +print(xgb_cl) +### Predict +preds = xgb_cl.predict(X_test) + +print(accuracy_score(y_test, preds)) + +print(y_test) +print(model_xgb.predict(dtest)) +print(np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)])) +predict_train = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)]) + +print(accuracy_score(y_test, predict_train)) + +from xgboost import plot_importance +from xgboost import plot_tree, to_graphviz + +importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_}) +importances = importances.sort_values(by = "Importance", ascending = False) +importances = 
importances.set_index('Feature') +print(importances) +importances.plot.bar() + +fig, ax = plt.subplots(figsize=(17,12)) +plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax) +plt.xlabel('Feature scores') +plt.ylabel("Feature names") +plt.title('Importance plot') +plt.legend(['']) +#plt.show() +plt.savefig(f"plot/{folder_save}/importance.jpg") + + +feature_importance = model_xgb.get_score(importance_type = 'weight') +keys = list(feature_importance.keys()) +names_sig = ['m(H)', '$p_t$(H)', '$p_t$(Z)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$', + '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet', + '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(e_1, e_2)$', '$\Delta\eta(e_1, e_2)$', + '$\Delta\Phi (e_{subleading}, jet_{subleading})$', '$\Delta\Phi (e_{subleading}, jet_{leading})$'] +values = list(feature_importance.values()) +data = pd.DataFrame(data = values, index = names_sig, columns = ['score']).sort_values(by = 'score', ascending = False) +print(data) +print(data.index) + + +fig = plt.figure(figsize=(17,12)) +ax1 = fig.add_subplot(1,2,1) +ax1.set_axis_off() +ax2 = fig.add_subplot(1,2,2) +ax2.barh(list(reversed(data.index)), list(reversed(data.score))) +ax2.set_xlabel('Feature scores') +ax2.set_ylabel("Feature names") +ax2.set_title('Importance plot') +#plt.show() +plt.savefig(f"plot/{folder_save}/importance_train_lr_{learning_rate}.jpg") + + +plt.figure(figsize=(17,12)) +plot_tree(xgb_cl, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"plot/{folder_save}/boost_tree.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' + +plt.figure(figsize=(17,12)) +plot_tree(model_xgb, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"plot/{folder_save}/boost_tree_train_lr_{learning_rate}.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' +''' +plt.figure(figsize=(17,12)) +to_graphviz(model_xgb, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"plot/{folder_save}/boost_tree_train_graphviz.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' diff --git a/xgb_test_only_xgb_reloaded_no_coffea.py b/xgb_test_only_xgb_reloaded_no_coffea.py new file mode 100644 index 0000000..bc91384 --- /dev/null +++ b/xgb_test_only_xgb_reloaded_no_coffea.py @@ -0,0 +1,287 @@ +from coffea.util import load +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt, mplhep as hep +import hist +import argparse, sys, os, arrow, glob, yaml +from matplotlib.offsetbox import AnchoredText +import xgboost as xgb +from hyperopt import STATUS_OK, Trials, fmin, hp, tpe +from sklearn.metrics import accuracy_score +from tqdm.notebook import tqdm +from sklearn.metrics import roc_auc_score, roc_curve +from sklearn.model_selection import RepeatedKFold +import json + +net_path = "/net/scratch_cms3a/vaulin/" +folder_save = 'eval_23_05_02' +if not os.path.exists(f"./plot/{folder_save}"): + os.mkdir(f"./plot/{folder_save}") +if not os.path.exists(net_path + f"plot/{folder_save}"): + os.mkdir(net_path + f"plot/{folder_save}") +df = pd.read_csv(net_path + 'xgb_training_dataset_low_mumu.csv') + +roi = 'low_mumu' +learning_rate = 0.3 + +time = arrow.now().format("YY_MM_DD") +plt.style.use(hep.style.ROOT) + +X = df.drop("target", axis = 1) +print(X) +X = X.drop(f"wei_{roi}", axis = 
1) +X = X.drop(f"Z_mass_{roi}", axis = 1) +X = X.drop(f"Z_pt_gen_{roi}", axis = 1) +X = X.drop(f"Z_mass_gen_{roi}", axis = 1) +print(X) +print(X.info()) + +y = df["target"] +print(y) + + +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder + +categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + +from sklearn.preprocessing import StandardScaler +numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + +cat_cols = X.select_dtypes(exclude = "number").columns +num_cols = X.select_dtypes(include = "number").columns + +print(cat_cols) +print(num_cols) + +from sklearn.compose import ColumnTransformer + +full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + + + +X_processed = full_processor.fit_transform(X) +y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state = 1121218) +#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 101, stratify = y) +print(X_train) +print(X_test) +print(y_train) + + +############################################################################################################################################### +################### Getting ROC curves from json files ######################################################################################## +############################################################################################################################################### +def convert(x): + if hasattr(x, "tolist"): + return x.tolist() + raise TypeError(x) + + +kind = 'val' +#kind = 'test' +#kind = 'train' + +def pretty_ROC_Curve(tr_set, kind, type): + + with open(tr_set) as user_file: + file_contents = user_file.read() + + results = json.loads(file_contents) + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + + c_fill = 'rgba(52, 152, 219, 0.2)' + c_line = 'rgba(52, 152, 219, 0.5)' + c_line_main = 'rgba(41, 128, 185, 1.0)' + c_grid = 'rgba(189, 195, 199, 0.5)' + c_annot = 'rgba(149, 165, 166, 0.5)' + c_highlight = 'rgba(192, 57, 43, 1.0)' + + fpr_mean = np.linspace(0, 1, 100) + + interp_tprs = [] + for i in range(100): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) + tpr_mean = np.mean(interp_tprs, axis = 0) + tpr_mean[-1] = 1.0 + tpr_std = 2*np.std(interp_tprs, axis = 0) + tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) + tpr_lower = tpr_mean - tpr_std + auc = np.mean(results[kind]['auc']) + + range_plot_x = [0,1] + range_plot_y = [0.2,1] + + import plotly.graph_objects as go + + + fig = go.Figure([go.Scatter(x = tpr_upper, y = 1 - fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = 1 - fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = tpr_mean, y = 1 - fpr_mean, 
line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.5f}')]) + + fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) + fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) + fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') + fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + + fig.write_image(net_path + f"plot/{folder_save}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}.jpg") + fig.write_image(net_path + f"plot/{folder_save}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}.pdf") + + +pretty_ROC_Curve(net_path + f"plot/{folder_save}/results_lr_{learning_rate}.json", kind, "full") +############################################################################################################################################################## +##################### Zero train ROC ######################################################################################################################### +############################################################################################################################################################## + +pretty_ROC_Curve(net_path + f"plot/{folder_save}/results_zero_train_lr_{learning_rate}.json", kind, 'zero') + +############################################################################################################################################################## +##################### Weak train ROC ######################################################################################################################### +############################################################################################################################################################## + +pretty_ROC_Curve(net_path + f"plot/{folder_save}/results_weak_train_lr_{learning_rate}.json", kind, 'weak') + +############################################################################################################################################################## + + +trials = Trials() + +############################################################################################################################################################## +##################### Initiate the final training to be presented with the best parameters ################################################################### +############################################################################################################################################################## + +from sklearn.metrics import accuracy_score + +### Init classifier +xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = 0.0292, gamma = 1.087, reg_alpha = 42.0, reg_lambda = 0.381, n_estimators = 200, max_depth = 8, subsample = 0.841, min_child_weight = 2.0, colsample_bytree = 0.994) + +### Fit +params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} +metrics = ['auc', 'fpr', 'tpr', 'thresholds'] +dtest = xgb.DMatrix(X_test, label = y_test) +#print(dtest) +dtrain = xgb.DMatrix(X_train[:int(len(X_train)*0.8), :], label = 
y_train[:int(len(y_train)*0.8)]) +dval = xgb.DMatrix(X_train[int(len(X_train)*0.8):, :], label = y_train[int(len(y_train)*0.8):]) +model_xgb = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 200) #num_boost_round = 1000, +sets = [dtrain, dval, dtest] +results_new = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +params_new = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss'} + +for i, ds in enumerate(results_new.keys()): + print(i) + y_preds_new = model_xgb.predict(sets[i]) + labels_new = sets[i].get_label() + fpr_new, tpr_new, thresholds_new = roc_curve(labels_new, y_preds_new) + results_new[ds]['fpr'].append(fpr_new) + results_new[ds]['tpr'].append(tpr_new) + results_new[ds]['thresholds'].append(thresholds_new) + results_new[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new)) + +xgb_cl.fit(X_train, y_train) + +print(xgb_cl) + +################################################################################################################################### +################################## Predict and give the final accuracy scores and importance plots ################################ +################################################################################################################################### +preds = xgb_cl.predict(X_test) + +print(accuracy_score(y_test, preds)) + +print(y_test) +print(model_xgb.predict(dtest)) +print(np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)])) +predict_train = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)]) + +print(accuracy_score(y_test, predict_train)) + +from xgboost import plot_importance +from xgboost import plot_tree, to_graphviz + +importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_}) +importances = importances.sort_values(by = "Importance", ascending = False) +importances = importances.set_index('Feature') +print(importances) +importances.plot.bar() + +fig, ax = plt.subplots(figsize=(17,12)) +plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax) +plt.xlabel('Feature scores') +plt.ylabel("Feature names") +plt.title('Importance plot') +plt.legend(['']) +#plt.show() +plt.savefig(net_path + f"plot/{folder_save}/importance.jpg") + + +feature_importance = model_xgb.get_score(importance_type = 'weight') +keys = list(feature_importance.keys()) +names_sig = ['m(H)', '$p_t$(H)', '$p_t$(Z)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$', + '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet', + '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$', + '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$'] +values = list(feature_importance.values()) +data = pd.DataFrame(data = values, index = names_sig, columns = ['score']).sort_values(by = 'score', ascending = False) +print(data) +print(data.index) + + +fig = plt.figure(figsize=(17,12)) +ax1 = fig.add_subplot(1,2,1) +ax1.set_axis_off() +ax2 = fig.add_subplot(1,2,2) +ax2.barh(list(reversed(data.index)), list(reversed(data.score))) +ax2.set_xlabel('Feature scores') +ax2.set_ylabel("Feature names") +ax2.set_title('Importance plot') +#plt.show() +plt.savefig(net_path + 
f"plot/{folder_save}/importance_train_lr_{learning_rate}.jpg") + + +plt.figure(figsize=(17,12)) +plot_tree(xgb_cl, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(net_path + f"plot/{folder_save}/boost_tree.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' + +plt.figure(figsize=(17,12)) +plot_tree(model_xgb, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(net_path + f"plot/{folder_save}/boost_tree_train_lr_{learning_rate}.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' + +plt.figure(figsize=(17,12)) +plt.hist(np.array(model_xgb.predict(dtest)), bins = 40, edgecolor = 'blue',fill = False) +plt.hist(np.array(predict_train), bins = 40, edgecolor = 'green', hatch = '/', fill = False) +plt.hist(y_test, bins = 40, facecolor = 'orange', edgecolor = 'orange', fill = False) +plt.title('Classifier output') +plt.legend(['Train output', 'Train output after threshold','Test data']) +#plt.show() +plt.savefig(net_path + f"plot/{folder_save}/class_output_train_lr_{learning_rate}.jpg") +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' + +''' +plt.figure(figsize=(17,12)) +to_graphviz(model_xgb, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"plot/{folder_save}/boost_tree_train_graphviz.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' diff --git a/xgb_test_only_xgb_reloaded_no_coffea_var.py b/xgb_test_only_xgb_reloaded_no_coffea_var.py new file mode 100644 index 0000000..4451ef9 --- /dev/null +++ b/xgb_test_only_xgb_reloaded_no_coffea_var.py @@ -0,0 +1,404 @@ +from coffea.util import load +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt, mplhep as hep +import hist +import argparse, sys, os, arrow, glob, yaml +from matplotlib.offsetbox import AnchoredText +import xgboost as xgb +from hyperopt import STATUS_OK, Trials, fmin, hp, tpe +from sklearn.metrics import accuracy_score +from tqdm.notebook import tqdm +from sklearn.metrics import roc_auc_score, roc_curve +from sklearn.model_selection import RepeatedKFold +import json + +net_path = "/net/scratch_cms3a/vaulin/" +folder_save = 'eval_23_05_02' +if not os.path.exists(f"./plot/{folder_save}"): + os.mkdir(f"./plot/{folder_save}") +if not os.path.exists(net_path + f"plot/{folder_save}"): + os.mkdir(net_path + f"plot/{folder_save}") +df = pd.read_csv(net_path + 'xgb_training_dataset_low_mumu.csv') + +roi = 'low_mumu' +learning_rate = 0.3 + +names_sig = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +var = f'Higgs_mass_{roi}' + +time = arrow.now().format("YY_MM_DD") +plt.style.use(hep.style.ROOT) + +X = df[var] +print(X) +print(X.info()) + +X_signal = df[var][df.target == 1] +X_bg = df[var][df.target == 0] + +y = df["target"] +print(y) + +y_signal = df["target"][df.target == 1] +y_bg = df["target"][df.target == 0] + +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder + +categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + +from sklearn.preprocessing 
import StandardScaler +numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + +cat_cols = pd.Index([], dtype = 'object') +num_cols = pd.Index([var], dtype = 'object') + +print(cat_cols) +print(num_cols) + +from sklearn.compose import ColumnTransformer + +full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + + + +#X_processed = full_processor.fit_transform(X) +y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) +y_processed_sig = SimpleImputer(strategy = "most_frequent").fit_transform(y_signal.values.reshape(-1,1)) +y_processed_bg = SimpleImputer(strategy = "most_frequent").fit_transform(y_bg.values.reshape(-1,1)) + +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X, y_processed, stratify = y_processed, random_state = 1121218) +X_train_sig, X_test_sig, y_train_sig, y_test_sig = train_test_split(X_signal, y_processed_sig, stratify = y_processed_sig, random_state = 1121218) +X_train_bg, X_test_bg, y_train_bg, y_test_bg = train_test_split(X_bg, y_processed_bg, stratify = y_processed_bg, random_state = 1121218) +#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 101, stratify = y) +print(X_train) +print(X_test) +print(y_train) + + +############################################################################################################################################### +################### Getting ROC curves from json files ######################################################################################## +############################################################################################################################################### +def convert(x): + if hasattr(x, "tolist"): + return x.tolist() + raise TypeError(x) + + +kind = 'val' +#kind = 'test' +#kind = 'train' + +def pretty_ROC_Curve(tr_set, kind, type, var): + + with open(tr_set) as user_file: + file_contents = user_file.read() + + results = json.loads(file_contents) + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + + c_fill = 'rgba(52, 152, 219, 0.2)' + c_line = 'rgba(52, 152, 219, 0.5)' + c_line_main = 'rgba(41, 128, 185, 1.0)' + c_grid = 'rgba(189, 195, 199, 0.5)' + c_annot = 'rgba(149, 165, 166, 0.5)' + c_highlight = 'rgba(192, 57, 43, 1.0)' + + fpr_mean = np.linspace(0, 1, 100) + + interp_tprs = [] + for i in range(100): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) + tpr_mean = np.mean(interp_tprs, axis = 0) + tpr_mean[-1] = 1.0 + tpr_std = 2*np.std(interp_tprs, axis = 0) + tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) + tpr_lower = tpr_mean - tpr_std + auc = np.mean(results[kind]['auc']) + + range_plot_x = [0,1] + range_plot_y = [0.2,1] + + import plotly.graph_objects as go + + + fig = go.Figure([go.Scatter(x = tpr_upper, y = 1 - fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = 1 - fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = tpr_mean, y = 1 - fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo 
= 'skip', showlegend = True, name = f'AUC: {auc:.5f}')]) + + fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) + fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) + fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') + fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + + fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}.jpg") + fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}.pdf") + + +pretty_ROC_Curve(f"plot/{folder_save}/results_lr_{learning_rate}.json", kind, "full", var) +############################################################################################################################################################## +##################### Zero train ROC ######################################################################################################################### +############################################################################################################################################################## + +pretty_ROC_Curve(f"plot/{folder_save}/results_zero_train_lr_{learning_rate}.json", kind, 'zero', var) + +############################################################################################################################################################## +##################### Weak train ROC ######################################################################################################################### +############################################################################################################################################################## + +pretty_ROC_Curve(f"plot/{folder_save}/results_weak_train_lr_{learning_rate}.json", kind, 'weak', var) + +############################################################################################################################################################## + + +trials = Trials() + +############################################################################################################################################################## +##################### Initiate the final training to be presented with the best parameters ################################################################### +############################################################################################################################################################## + +from sklearn.metrics import accuracy_score + +### Init classifier +xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = 0.0292, gamma = 1.087, reg_alpha = 42.0, reg_lambda = 0.381, n_estimators = 200, max_depth = 8, subsample = 0.841, min_child_weight = 2.0, colsample_bytree = 0.994, scale_pos_weight = 10) + +### Fit +params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} +metrics = ['auc', 'fpr', 'tpr', 'thresholds'] +dtest = xgb.DMatrix(X_test, label = y_test) +dtest_signal = xgb.DMatrix(X_test_sig, label = y_test_sig) +dtest_bg = xgb.DMatrix(X_test_bg, label = y_test_bg) +#print(dtest) +dtrain = 
xgb.DMatrix(X_train[:int(len(X_train)*0.8)], label = y_train[:int(len(y_train)*0.8)]) +dval = xgb.DMatrix(X_train[int(len(X_train)*0.8):], label = y_train[int(len(y_train)*0.8):]) +model_xgb = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 200) #num_boost_round = 1000, +model_xgb_weak = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 20) #num_boost_round = 1000, +model_xgb_zero = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 2) #num_boost_round = 1000, +sets = [dtrain, dval, dtest] +results_new = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +results_new_weak = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +results_new_zero = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +params_new = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss'} + +for i, ds in enumerate(results_new.keys()): + print(i) + y_preds_new = model_xgb.predict(sets[i]) + y_preds_new_weak = model_xgb_weak.predict(sets[i]) + y_preds_new_zero = model_xgb_zero.predict(sets[i]) + labels_new = sets[i].get_label() + fpr_new, tpr_new, thresholds_new = roc_curve(labels_new, y_preds_new) + fpr_new_weak, tpr_new_weak, thresholds_new_weak = roc_curve(labels_new, y_preds_new_weak) + fpr_new_zero, tpr_new_zero, thresholds_new_zero = roc_curve(labels_new, y_preds_new_zero) + results_new[ds]['fpr'].append(fpr_new) + results_new[ds]['tpr'].append(tpr_new) + results_new[ds]['thresholds'].append(thresholds_new) + results_new[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new)) + results_new_weak[ds]['fpr'].append(fpr_new_weak) + results_new_weak[ds]['tpr'].append(tpr_new_weak) + results_new_weak[ds]['thresholds'].append(thresholds_new_weak) + results_new_weak[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new_weak)) + results_new_zero[ds]['fpr'].append(fpr_new_zero) + results_new_zero[ds]['tpr'].append(tpr_new_zero) + results_new_zero[ds]['thresholds'].append(thresholds_new_zero) + results_new_zero[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new_zero)) + +def pretty_ROC_Curve_var(results, kind, type, var): + + results = results + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + + c_fill = 'rgba(52, 152, 219, 0.2)' + c_line = 'rgba(52, 152, 219, 0.5)' + c_line_main = 'rgba(41, 128, 185, 1.0)' + c_grid = 'rgba(189, 195, 199, 0.5)' + c_annot = 'rgba(149, 165, 166, 0.5)' + c_highlight = 'rgba(192, 57, 43, 1.0)' + + fpr_mean = np.linspace(0, 1, 100) + + interp_tprs = [] + for i in range(1): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) + tpr_mean = np.mean(interp_tprs, axis = 0) + tpr_mean[-1] = 1.0 + tpr_std = 2*np.std(interp_tprs, axis = 0) + tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) + tpr_lower = tpr_mean - tpr_std + auc = np.mean(results[kind]['auc']) + + range_plot_x = [0,1] + range_plot_y = [0.2,1] + + import plotly.graph_objects as go + + + fig = go.Figure([go.Scatter(x = tpr_upper, y = 1 - fpr_mean, line = dict(color 
= c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = 1 - fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = tpr_mean, y = 1 - fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.5f}')]) + + fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) + fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) + fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') + fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + + fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}_new.jpg") + fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}_new.pdf") + +pretty_ROC_Curve_var(results_new, 'test', 'full', var) + +xgb_cl.fit(X_train, y_train) + +print(xgb_cl) + +################################################################################################################################### +################################## Predict and give the final accuracy scores and importance plots ################################ +################################################################################################################################### +preds = xgb_cl.predict(X_test) + +print(accuracy_score(y_test, preds)) + +print(y_test) +print(model_xgb.predict(dtest)) +print(np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)])) +predict_train = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)]) +predict_train_weak = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb_weak.predict(dtest)]) +predict_train_zero = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb_zero.predict(dtest)]) + +print(accuracy_score(y_test, predict_train)) + +from xgboost import plot_importance +from xgboost import plot_tree, to_graphviz + +#importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_}) +#importances = importances.sort_values(by = "Importance", ascending = False) +#importances = importances.set_index('Feature') +#print(importances) +#importances.plot.bar() + +fig, ax = plt.subplots(figsize=(17,12)) +plot_importance(xgb_cl, fmap = 'feature_map_var.txt', ax = ax) +plt.xlabel('Feature scores') +plt.ylabel("Feature names") +plt.title('Importance plot') +plt.legend(['']) +#plt.show() +plt.savefig(f"plot/{folder_save}/importance_{var}.jpg") + + +feature_importance = model_xgb.get_score(importance_type = 'weight') +keys = list(feature_importance.keys()) +'''names_sig = ['m(H)', '$p_t$(H)', '$p_t$(Z)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$', + '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet', + '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$', + 
'$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$']'''
+names_sig = ['m(H)']
+values = list(feature_importance.values())
+data = pd.DataFrame(data = values, index = names_sig, columns = ['score']).sort_values(by = 'score', ascending = False)
+print(data)
+print(data.index)
+
+
+fig = plt.figure(figsize=(17,12))
+ax1 = fig.add_subplot(1,2,1)
+ax1.set_axis_off()
+ax2 = fig.add_subplot(1,2,2)
+ax2.barh(list(reversed(data.index)), list(reversed(data.score)))
+ax2.set_xlabel('Feature scores')
+ax2.set_ylabel("Feature names")
+ax2.set_title('Importance plot')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/importance_train_lr_{learning_rate}_{var}.jpg")
+
+
+plt.figure(figsize=(17,12))
+plot_tree(xgb_cl, fmap = 'feature_map_var.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/boost_tree_{var}.jpg", dpi = 1800)
+### result = 1/(1+np.exp(-leaf_value)) is the probability of belonging to class 1
+#plt.show()
+
+plt.figure(figsize=(17,12))
+plot_tree(model_xgb, fmap = 'feature_map_var.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/boost_tree_train_lr_{learning_rate}_{var}.jpg", dpi = 1800)
+### result = 1/(1+np.exp(-leaf_value)) is the probability of belonging to class 1
+#plt.show()
+
+# Compare the raw classifier scores, the thresholded (0/1) predictions and the true labels on the test set
+plt.figure(figsize=(17,12))
+plt.hist(np.array(model_xgb.predict(dtest)), bins = 40, edgecolor = 'blue', fill = False)
+plt.hist(np.array(predict_train), bins = 40, edgecolor = 'green', hatch = '/', fill = False)
+plt.hist(y_test, bins = 40, facecolor = 'orange', edgecolor = 'orange', fill = False)
+plt.title('Classifier output')
+plt.legend(['Model output on test set', 'Output after 0.5 threshold', 'True test labels'])
+#plt.show()
+plt.savefig(f"plot/{folder_save}/class_output_train_lr_{learning_rate}_{var}.jpg")
+
+plt.figure(figsize=(17,12))
+plt.hist(np.array(model_xgb.predict(dtest_signal)), bins = 40, edgecolor = 'blue', fill = False)
+plt.hist(np.array(model_xgb.predict(dtest_bg)), bins = 40, edgecolor = 'red', fill = False)
+plt.title('Classifier output')
+plt.legend(['Signal', 'Background'])
+#plt.show()
+plt.savefig(f"plot/{folder_save}/class_output_train_lr_{learning_rate}_{var}_sig_vs_bg.jpg")
+
+plt.figure(figsize=(17,12))
+plt.hist(np.array(model_xgb_weak.predict(dtest)), bins = 40, edgecolor = 'blue', fill = False)
+plt.hist(np.array(predict_train_weak), bins = 40, edgecolor = 'green', hatch = '/', fill = False)
+plt.hist(y_test, bins = 40, facecolor = 'orange', edgecolor = 'orange', fill = False)
+plt.title('Classifier output')
+plt.legend(['Model output on test set', 'Output after 0.5 threshold', 'True test labels'])
+#plt.show()
+plt.savefig(f"plot/{folder_save}/class_output_train_lr_{learning_rate}_{var}_weak.jpg")
+
+plt.figure(figsize=(17,12))
+plt.hist(np.array(model_xgb_zero.predict(dtest)), bins = 40, edgecolor = 'blue', fill = False)
+plt.hist(np.array(predict_train_zero), bins = 40, edgecolor = 'green', hatch = '/', fill = False)
+plt.hist(y_test, bins = 40, facecolor = 'orange', edgecolor = 'orange', fill = False)
+plt.title('Classifier output')
+plt.legend(['Model output on test set', 'Output after 0.5 threshold', 'True test labels'])
+#plt.show()
+plt.savefig(f"plot/{folder_save}/class_output_train_lr_{learning_rate}_{var}_zero.jpg")
+
+### result = 1/(1+np.exp(-leaf_value)) is the probability of belonging to class 1
+#plt.show()
+
+# the quantity written here is the test-set accuracy, not a ROC AUC
+with open(f"plot/{folder_save}/ROC.txt", "a") as myfile:
+    myfile.write(f"Accuracy score for {var}: " + str(accuracy_score(y_test, predict_train)) + " " + '\n')
+
+'''
+plt.figure(figsize=(17,12))
+to_graphviz(model_xgb, fmap = 'feature_map.txt')
+plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"plot/{folder_save}/boost_tree_train_graphviz.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' From f3cfb344bae12e22984f8fd802a5408138f58ad1 Mon Sep 17 00:00:00 2001 From: ValVau <109755950+ValVau@users.noreply.github.com> Date: Wed, 31 May 2023 10:46:52 +0200 Subject: [PATCH 2/3] Add files via upload --- ...s_newHist_pandas_small_update_isolation.py | 241 +++++++++++++++++- 1 file changed, 233 insertions(+), 8 deletions(-) diff --git a/Zll_process_newHist_pandas_small_update_isolation.py b/Zll_process_newHist_pandas_small_update_isolation.py index 11de771..d7e32d0 100644 --- a/Zll_process_newHist_pandas_small_update_isolation.py +++ b/Zll_process_newHist_pandas_small_update_isolation.py @@ -585,9 +585,9 @@ def process(self, events): nEvents = len(events) print('Number of events: ', nEvents) if 'ZH' in dataset: - ttyp = 'signal_04_mid' + ttyp = 'signal_05_late' else: - ttyp = 'back_04_mid' + ttyp = 'back_05_late' folder_save = f'condor_{ttyp}' if not os.path.exists(f"./{folder_save}"): os.mkdir(f"./{folder_save}") @@ -595,8 +595,7 @@ def process(self, events): os.mkdir(f"./{folder_save}/{dataset}") if not os.path.exists(f"./{folder_save}/{dataset}/{filename}"): os.mkdir(f"./{folder_save}/{dataset}/{filename}") - with open(f"./{folder_save}/event_nr.txt", "a") as myfile: - myfile.write(f"Nr of events in {filename} from {start} to {stop}: " + str(nEvents) + " " + '\n') + # As far as I understand, this looks like a neat way to give selections a name, # while internally, there are boolean arrays for all events @@ -786,7 +785,14 @@ def process(self, events): - + names_events = [] + values_events = [] + names_events.append("Filename") + names_events.append("Start") + names_events.append("Stop") + values_events.append(f"{filename}") + values_events.append(f"{start}") + values_events.append(f"{stop}") # ================================================================================= # @@ -802,21 +808,102 @@ def process(self, events): ## muon twiki: https://twiki.cern.ch/twiki/bin/view/CMS/SWGuideMuonIdRun2 #event_mu = events.Muon[ak.argsort(events.Muon.pt, axis=1, ascending=False)] event_mu = events.Muon + nEvent_mu = len(event_mu) + ################################################################################### + nEvent_mu = ak.sum(event_mu.looseId, axis = 1) + nEvent_mu = ak.sum((nEvent_mu == 2)) + names_events.append("Number of 2 mu events") + values_events.append(nEvent_mu) + ################################################################################### # looseId >= 1 or looseId seems to be the same... 
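        # (In NanoAOD the Muon_looseId branch is stored as a boolean flag, so
        # `looseId` and `looseId >= 1` select exactly the same muons, which is why
        # the two forms behave identically here.)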
musel = ((event_mu.pt > 20) & (abs(event_mu.eta) < 2.4) & (event_mu.looseId >= 1) & (event_mu.pfRelIso04_all<0.25)) #(event_mu.looseId >= 1) (event_mu.mvaId >= 3) # but 25GeV and 0.06 for 1L, xy 0.05 z 0.2, &(abs(event_mu.dxy)<0.06)&(abs(event_mu.dz)<0.2) and tightId for 1L + + ################################################################################### + ############### Cutflow every single cut ########################################## + ################################################################################### + n_event_mu_pt = ak.sum((event_mu.pt > 20), axis = 1) ###, axis = 1 + n_event_mu_pt = ak.sum((n_event_mu_pt == 2)) + names_events.append("Number of mu events pt cut") + values_events.append(n_event_mu_pt) + ################################################################################### + n_event_mu_eta = ak.sum((abs(event_mu.eta) < 2.4), axis = 1) + n_event_mu_eta = ak.sum((n_event_mu_eta == 2)) + names_events.append("Number of mu events eta cut") + values_events.append(n_event_mu_eta) + ################################################################################### + n_event_mu_looseId = ak.sum((event_mu.looseId >= 1), axis = 1) + n_event_mu_looseId = ak.sum((n_event_mu_looseId == 2)) + names_events.append("Number of mu events looseId cut") + values_events.append(n_event_mu_looseId) + ################################################################################### + n_event_mu_iso = ak.sum((event_mu.pfRelIso04_all<0.25), axis = 1) + n_event_mu_iso = ak.sum((n_event_mu_iso == 2)) + names_events.append("Number of mu events iso cut") + values_events.append(n_event_mu_iso) + ################################################################################### + + ################################################################################### + ############### Cutflow cuts applied gradually #################################### + ################################################################################### + musel_pt_eta = ((event_mu.pt > 20) & (abs(event_mu.eta) < 2.4)) + nmu_pt_eta = ak.sum(musel_pt_eta,axis=1) + names_events.append("Mu selection pt eta") + values_events.append(ak.sum(nmu_pt_eta == 2)) + ################################################################################### + musel_plus_looseid = ((event_mu.pt > 20) & (abs(event_mu.eta) < 2.4) & (event_mu.looseId >= 1)) + nmu_plus_looseid = ak.sum(musel_plus_looseid,axis=1) + names_events.append("Mu selection pt eta looseId") + values_events.append(ak.sum(nmu_plus_looseid == 2)) + ################################################################################### + event_mu = event_mu[musel] + n_event_mu_sel = len(event_mu) event_mu = event_mu[ak.argsort(event_mu.pt, axis=1, ascending=False)] event_mu["lep_flav"] = 13*event_mu.charge event_mu= ak.pad_none(event_mu,2,axis=1) nmu = ak.sum(musel,axis=1) + names_events.append("Final mu selection") + values_events.append(ak.sum(nmu == 2)) # ToDo: PtCorrGeoFit # ## Electron cuts ## # electron twiki: https://twiki.cern.ch/twiki/bin/viewauth/CMS/CutBasedElectronIdentificationRun2 #event_e = events.Electron[ak.argsort(events.Electron.pt, axis=1,ascending=False)] event_e = events.Electron + nEvent_ele = len(event_e) + ################################################################################### + nEvent_ele = ak.sum((abs(event_e.pt)>=0), axis = 1) + nEvent_ele = ak.sum((nEvent_ele == 2)) + names_events.append("Number of 2 ele events") + values_events.append(nEvent_ele) + 
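+    ###################################################################################
+    # Cutflow idiom used above for the muons and below for electrons and jets.
+    # A minimal sketch (the names `mask`, `n_pass` and `n_events` are illustrative
+    # only and are not variables of this file):
+    #
+    #   mask     = (event_mu.pt > 20) & (abs(event_mu.eta) < 2.4)  # per-object booleans
+    #   n_pass   = ak.sum(mask, axis=1)   # reduce over the object axis: one count per event
+    #   n_events = ak.sum(n_pass == 2)    # events with exactly two passing objects
+    #
+    # Each cutflow entry appends one such event count to values_events. Leptons are
+    # counted with "== 2"; the jet cutflow below uses ">= 2" since at least two jets
+    # are required.
+    ###################################################################################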
################################################################################### elesel = ((event_e.pt > 20) & (abs(event_e.eta) < 2.5) & (event_e.mvaFall17V2Iso_WP90==1) & (event_e.pfRelIso03_all<0.25)) + ################################################################################### + + ################################################################################### + ############### Cutflow every single cut ########################################## + ################################################################################### + n_event_ele_pt = ak.sum((event_e.pt > 20), axis = 1) + n_event_ele_pt = ak.sum((n_event_ele_pt == 2)) + names_events.append("Number of ele events pt cut") + values_events.append(n_event_ele_pt) + ################################################################################### + n_event_ele_eta = ak.sum((abs(event_e.eta) < 2.5), axis = 1) + n_event_ele_eta = ak.sum((n_event_ele_eta == 2)) + names_events.append("Number of ele events eta cut") + values_events.append(n_event_ele_eta) + ################################################################################### + n_event_ele_mvaIso = ak.sum((event_e.mvaFall17V2Iso_WP90==1), axis = 1) + n_event_ele_mvaIso = ak.sum((n_event_ele_mvaIso == 2)) + names_events.append("Number of ele events mva Iso cut") + values_events.append(n_event_ele_mvaIso) + ################################################################################### + n_event_ele_pfrelIso = ak.sum((event_e.pfRelIso03_all<0.25), axis = 1) + n_event_ele_pfrelIso = ak.sum((n_event_ele_pfrelIso == 2)) + names_events.append("Number of ele events pf Rel Iso cut") + values_events.append(n_event_ele_pfrelIso) + ################################################################################### # but 30GeV and WP80 for 1L event_e = event_e[elesel] # something I saw in a recent presentation, and also in AT code: @@ -828,7 +915,26 @@ def process(self, events): event_e = event_e[ak.argsort(event_e.pt, axis=1,ascending=False)] event_e["lep_flav"] = 11*event_e.charge event_e = ak.pad_none(event_e,2,axis=1) + + ################################################################################### + ############### Cutflow cuts applied gradually #################################### + ################################################################################### + ele_pt_eta = ((event_e.pt > 20) & (abs(event_e.eta) < 2.5)) + nele_pt_eta = ak.sum(ele_pt_eta,axis=1) + names_events.append("Ele selection pt eta") + values_events.append(ak.sum(nele_pt_eta == 2)) + ################################################################################### + esel_plus_mvaId = ((event_e.pt > 20) & (abs(event_e.eta) < 2.5) & (event_e.mvaFall17V2Iso_WP90==1)) + nele_plus_mvaId = ak.sum(esel_plus_mvaId,axis=1) + names_events.append("Ele selection pt eta mvaId") + values_events.append(ak.sum(nele_plus_mvaId == 2)) + ################################################################################### + + + nele = ak.sum(elesel,axis=1) + names_events.append("Final ele selection") + values_events.append(ak.sum(nele == 2)) # sorting after selecting should be faster (less computations on average) # for this channel (Zll / 2L) @@ -1022,7 +1128,12 @@ def deepflavcvsbtag(jet): #jets["btagDeepFlavCvL"] = deepflavcvsltag(jets) #jets["btagDeepFlavCvB"] = deepflavcvsbtag(jets) jets = jets[ak.argsort(jets.btagDeepFlavCvL, axis=1, ascending=False)] - + ################################################################################### + nEvent_jets = 
ak.sum((jets.btagDeepFlavCvL>=0), axis = 1) + nEvent_jets = ak.sum((nEvent_jets >= 2)) + names_events.append("Number of 2+ jet events") + values_events.append(nEvent_jets) + ################################################################################### # Jets are considered only if the following identification conditions hold, as mentioned in AN # - Here is some documentation related to puId and jetId: @@ -1030,9 +1141,89 @@ def deepflavcvsbtag(jet): # https://twiki.cern.ch/twiki/bin/viewauth/CMS/JetID jet_conditions = (((abs(jets.eta) < 2.4) & (jets.pt > 20) & (jets.puId > 0)) \ | ((jets.pt>50) & (jets.jetId>5))) & ak.all(jets.metric_table(ll_cand.lep1)>0.4, axis =2) & ak.all(jets.metric_table(ll_cand.lep2)>0.4, axis = 2) + + ################################################################################### + ############### Cutflow every single cut ########################################## + ################################################################################### + n_event_jet_eta = ak.sum((abs(jets.eta) < 2.4), axis = 1) + n_event_jet_eta = ak.sum((n_event_jet_eta >= 2)) + names_events.append("Number of jet events eta cut") + values_events.append(n_event_jet_eta) + ################################################################################### + n_event_jet_pt = ak.sum((jets.pt > 20), axis = 1) + n_event_jet_pt = ak.sum((n_event_jet_pt >= 2)) + names_events.append("Number of jet events pt cut") + values_events.append(n_event_jet_pt) + ################################################################################### + n_event_jet_puId = ak.sum((jets.puId > 0), axis = 1) + n_event_jet_puId = ak.sum((n_event_jet_puId >= 2)) + names_events.append("Number of jet events puId cut") + values_events.append(n_event_jet_puId) + ################################################################################### + n_event_jet_pt_strong = ak.sum((jets.pt>50), axis = 1) + n_event_jet_pt_strong = ak.sum((n_event_jet_pt_strong >= 2)) + names_events.append("Number of jet events pt strong cut") + values_events.append(n_event_jet_pt_strong) + ################################################################################### + n_event_jet_jetId = ak.sum((jets.jetId>5), axis = 1) + n_event_jet_jetId = ak.sum((n_event_jet_jetId >= 2)) + names_events.append("Number of jet events jet_id cut") + values_events.append(n_event_jet_jetId) + ################################################################################### + n_event_jet_lepton_clean_1 = ak.sum(ak.all(jets.metric_table(ll_cand.lep1)>0.4, axis =2), axis = 1) + n_event_jet_lepton_clean_1 = ak.sum((n_event_jet_lepton_clean_1 >= 2)) + names_events.append("Number of jet events lepton clean 1 cut") + values_events.append(n_event_jet_lepton_clean_1) + ################################################################################### + n_event_jet_lepton_clean_2 = ak.sum(ak.all(jets.metric_table(ll_cand.lep2)>0.4, axis =2), axis = 1) + n_event_jet_lepton_clean_2 = ak.sum((n_event_jet_lepton_clean_2 >= 2)) + names_events.append("Number of jet events lepton clean 2 cut") + values_events.append(n_event_jet_lepton_clean_2) + ################################################################################### + + ################################################################################### + ############### Cutflow cuts applied gradually #################################### + ################################################################################### + jets_pt_eta = ((abs(jets.eta) < 2.4) & (jets.pt > 20)) + njet_pt_eta = 
ak.sum(jets_pt_eta,axis=1) + names_events.append("Number of jets selection pt eta") + values_events.append(ak.sum(njet_pt_eta >= 2)) + ################################################################################### + jsel_plus_puId = ((abs(jets.eta) < 2.4) & (jets.pt > 20) & (jets.puId > 0)) + njets_plus_puId = ak.sum(jsel_plus_puId,axis=1) + names_events.append("Number of jets selection pt eta puId") + values_events.append(ak.sum(njets_plus_puId >= 2)) + ################################################################################### + jsel_plus_jetId = ((jets.pt>50) & (jets.jetId>5)) + njets_plus_jetId = ak.sum(jsel_plus_jetId,axis=1) + names_events.append("Number of jets selection pt jetId") + values_events.append(ak.sum(njets_plus_jetId >= 2)) + ################################################################################### + jsel_no_cleaning = (((abs(jets.eta) < 2.4) & (jets.pt > 20) & (jets.puId > 0)) \ + | ((jets.pt>50) & (jets.jetId>5))) + njets_no_cleaning = ak.sum(jsel_no_cleaning,axis=1) + names_events.append("Number of jets full selection no cleaning") + values_events.append(ak.sum(njets_no_cleaning >= 2)) + ################################################################################### + jsel_one_cleaning = (((abs(jets.eta) < 2.4) & (jets.pt > 20) & (jets.puId > 0)) \ + | ((jets.pt>50) & (jets.jetId>5))) & ak.all(jets.metric_table(ll_cand.lep1)>0.4, axis =2) + njets_one_cleaning = ak.sum(jsel_one_cleaning,axis=1) + names_events.append("Number of jets full selection cleaning 1 lepton") + values_events.append(ak.sum(njets_one_cleaning >= 2)) + ################################################################################### + jsel_two_cleaning = (((abs(jets.eta) < 2.4) & (jets.pt > 20) & (jets.puId > 0)) \ + | ((jets.pt>50) & (jets.jetId>5))) & ak.all(jets.metric_table(ll_cand.lep2)>0.4, axis =2) + njets_two_cleaning = ak.sum(jsel_two_cleaning,axis=1) + names_events.append("Number of jets full selection cleaning 2 lepton") + values_events.append(ak.sum(njets_two_cleaning >= 2)) + ################################################################################### + + # Count how many jets exist that pass this selection njet = ak.sum(jet_conditions,axis=1) selection.add('jetsel',ak.to_numpy(njet>=2)) + names_events.append("Number of jet events final cut") + values_events.append(ak.sum((njet >= 2))) # ================================================================================= @@ -1263,6 +1454,9 @@ def res(mval, out): selection.add('CR_t_tbar_2LH',ak.to_numpy(req_cr_t_tbar_vpt_high)) + with open(f"./{folder_save}/event_nr.txt", "a") as myfile: + myfile.write(f"Nr of events in {filename} from {start} to {stop}: " + str(nEvents) + " " + '\n') + myfile.write(f"Nr of muon events in {filename} from {start} to {stop} with pt, eta, looseId, iso cuts : " + str(nEvent_mu) + " " + str(n_event_mu_pt) + " " + str(n_event_mu_eta) + " " + str(n_event_mu_looseId) + " " + str(n_event_mu_iso) + " " + str(n_event_mu_pt) + ' ' + str(n_event_mu_sel) + " " + '\n') @@ -1645,6 +1839,37 @@ def res(mval, out): df_wei = pd.DataFrame([], columns = ['weights']) df_wei['weights'] = list_weights weight = np.array(list_weights) + + try: + df_muons = pd.read_csv(f'{folder_save}/muons.csv') + except FileNotFoundError: + df_muons = pd.DataFrame([], columns = ['pt', 'looseid', 'looseid_cut']) + df_muons_this_file = pd.DataFrame([], columns = ['pt', 'looseid', 'looseid_cut']) + df_muons_this_file['looseid'] = pd.Series(np.array(ak.ravel(event_mu.looseId))) + df_muons_this_file['looseid_cut'] = 
pd.Series(np.array(ak.ravel((event_mu.looseId>2)))) + df_muons_this_file['pt'] = pd.Series(np.array(ak.ravel(event_mu.pt))) + df_muons = pd.concat([df_muons, df_muons_this_file], ignore_index = True) + df_muons.to_csv(f'{folder_save}/muons.csv', sep=',', encoding='utf-8', index=False) + + try: + df_cutflow = pd.read_csv(f'{folder_save}/cutflow.csv') + except FileNotFoundError: + df_cutflow = pd.DataFrame([], columns = names_events) + + + + + elements_start = df_cutflow["Start"] + df_cutflow.loc[f"{filename}_{start}_{stop}"] = values_events + if "Sum" in elements_start.values: + df_cutflow = df_cutflow[:-2] + df_cutflow.loc[f"{filename}_{start}_{stop}"] = values_events + df_cutflow.loc[f"Sum over file"] = [np.sum(df_cutflow[name]) if name not in names_events[:3] else "Sum" for name in names_events] + elif "Sum" not in elements_start.values: + df_cutflow.loc[f"Sum over file"] = [np.sum(df_cutflow[name]) if name not in names_events[:3] else "Sum" for name in names_events] + + + df_cutflow.to_csv(f'{folder_save}/cutflow.csv', sep=',', encoding='utf-8', index=False) #df_weights_full = pd.concat([df_weights, df_wei], ignore_index = True) @@ -1667,13 +1892,13 @@ def res(mval, out): for var in lists_of_vars.keys(): try: - else_var_array = np.load(f'{folder_save}/{dataset}/{filename}/test_{var}_full.npy') + else_var_array = np.load(f'{folder_save}/{dataset}/{filename}/test_{var}__{start}_{stop}_full.npy') except FileNotFoundError: else_var_array = np.array([]) finally: else_v_curr_array = np.array(lists_of_vars[var]) else_var_full_array = np.concatenate((else_var_array, else_v_curr_array), axis = None) - np.save(f'{folder_save}/{dataset}/{filename}/test_{var}_full.npy', else_var_full_array, allow_pickle = False) + np.save(f'{folder_save}/{dataset}/{filename}/test_{var}_{start}_{stop}_full.npy', else_var_full_array, allow_pickle = False) #df_else_full = pd.concat([df_else_everything, df_else], ignore_index = True) From 24e7e9585440b61bbe7b77b6b3fb000b055256f6 Mon Sep 17 00:00:00 2001 From: ValVau <109755950+ValVau@users.noreply.github.com> Date: Thu, 9 Nov 2023 14:22:22 +0100 Subject: [PATCH 3/3] Updated xgb_files --- xgb_test_data_DATA_no_coffea_chi2.py | 861 ++++++++++++ xgb_test_no_coffea.py | 231 ++- xgb_test_no_coffea_chi2.py | 778 +++++++++++ xgb_test_no_coffea_diff_bgs.py | 776 +++++++++++ xgb_test_no_coffea_diff_bgs_DATA.py | 793 +++++++++++ xgb_test_no_coffea_diff_bgs_DATA_scale.py | 808 +++++++++++ ...st_no_coffea_diff_bgs_DATA_scale_pandas.py | 1239 +++++++++++++++++ ...a_diff_bgs_DATA_scale_pandas_numpy_test.py | 813 +++++++++++ xgb_test_only_xgb_no_coffea.py | 51 +- xgb_test_only_xgb_no_coffea_diff_bgs.py | 416 ++++++ xgb_test_only_xgb_no_coffea_diff_bgs_3bgs.py | 417 ++++++ ...st_only_xgb_no_coffea_diff_bgs_all etas.py | 530 +++++++ ...only_xgb_no_coffea_diff_bgs_full_bg_set.py | 418 ++++++ xgb_test_only_xgb_reloaded_no_coffea.py | 22 +- xgb_test_only_xgb_reloaded_no_coffea_var.py | 69 +- xgb_test_only_xgb_reloaded_no_coffea_vars.py | 521 +++++++ ...est_only_xgb_reloaded_no_coffea_vars_bg.py | 524 +++++++ ..._xgb_reloaded_no_coffea_vars_bg_multibg.py | 525 +++++++ ...d_no_coffea_vars_bg_multibg_full_bg_set.py | 589 ++++++++ 19 files changed, 10277 insertions(+), 104 deletions(-) create mode 100644 xgb_test_data_DATA_no_coffea_chi2.py create mode 100644 xgb_test_no_coffea_chi2.py create mode 100644 xgb_test_no_coffea_diff_bgs.py create mode 100644 xgb_test_no_coffea_diff_bgs_DATA.py create mode 100644 xgb_test_no_coffea_diff_bgs_DATA_scale.py create mode 100644 
xgb_test_no_coffea_diff_bgs_DATA_scale_pandas.py create mode 100644 xgb_test_no_coffea_diff_bgs_DATA_scale_pandas_numpy_test.py create mode 100644 xgb_test_only_xgb_no_coffea_diff_bgs.py create mode 100644 xgb_test_only_xgb_no_coffea_diff_bgs_3bgs.py create mode 100644 xgb_test_only_xgb_no_coffea_diff_bgs_all etas.py create mode 100644 xgb_test_only_xgb_no_coffea_diff_bgs_full_bg_set.py create mode 100644 xgb_test_only_xgb_reloaded_no_coffea_vars.py create mode 100644 xgb_test_only_xgb_reloaded_no_coffea_vars_bg.py create mode 100644 xgb_test_only_xgb_reloaded_no_coffea_vars_bg_multibg.py create mode 100644 xgb_test_only_xgb_reloaded_no_coffea_vars_bg_multibg_full_bg_set.py diff --git a/xgb_test_data_DATA_no_coffea_chi2.py b/xgb_test_data_DATA_no_coffea_chi2.py new file mode 100644 index 0000000..b9dac73 --- /dev/null +++ b/xgb_test_data_DATA_no_coffea_chi2.py @@ -0,0 +1,861 @@ +from coffea.util import load +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt, mplhep as hep +import hist +import argparse, sys, os, arrow, glob, yaml +from matplotlib.offsetbox import AnchoredText +from pathlib import Path +import os +from BTVNanoCommissioning.utils.plot_utils import ( + plotratio, + +) +net_path = "/net/scratch_cms3a/vaulin/" +folder_save = 'eval_23_08_08' +if not os.path.exists(f"./plot/{folder_save}"): + os.mkdir(f"./plot/{folder_save}") +if not os.path.exists(f"./plot/{folder_save}/Small_scale"): + os.mkdir(f"./plot/{folder_save}/Small_scale") +if not os.path.exists(f"./plot/{folder_save}/Big_scale"): + os.mkdir(f"./plot/{folder_save}/Big_scale") +if not os.path.exists(f"./plot/{folder_save}/Small_but_not_that_small_scale"): + os.mkdir(f"./plot/{folder_save}/Small_but_not_that_small_scale") +if not os.path.exists(net_path + f"plot/{folder_save}"): + os.mkdir(net_path + f"plot/{folder_save}") +def autoranger(array): + val, axis = array, np.arange(0,len(array)+1) + for i in range(len(val)): + if val[i] != 0: + mins = i + break + for i in reversed(range(len(val))): + if val[i] != 0: + maxs = i + 1 + break + print(axis[mins], axis[maxs]) + return axis[mins], axis[maxs], np.max(val), np.min(val) +names_sig = ['wei', 'Higgs_mass', 'Higgs_pt', 'Z_pt', 'Z_mass', 'Z_pt_gen', 'Z_mass_gen', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +names_sig_data = ['wei', 'Higgs_mass', 'Higgs_pt', 'Z_pt', 'Z_mass', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +roiis = ['high_mumu', 'high_ee', 'low_mumu', 'low_ee'] +roi = 'low_mumu' +###################################################################################### +##### Read np arrays of signal sample ################################################ +###################################################################################### +data_path = 'condor_signal_06_mid/' +paths_np = [str(x) for x in Path(data_path + "ZHToCC_vau_sig").glob("**/*.npy") if ("_full" in str(x))] +#print(paths_np) +print(len(paths_np)) +df_sig_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_sig_full_np) + +key_np = {} +for col in names_sig: + for rois in roiis: + key_np[f'{col}_{rois}'] = [] +for col in names_sig: + for rois in roiis: + for path in 
paths_np: + if f'{col}_{rois}' in path: + key_np[f'{col}_{rois}'].append(path) + +for key in key_np.keys(): + #print(len(key_np[key]) == len(set(key_np[key]))) + key_np[key] = [np.load(element) for element in key_np[key]] + #print(key) + +print(key_np) + +key_np_full = {} +max_length = 0 +for col in names_sig: + for rois in roiis: + key_np_full[f'{col}_{rois}'] = np.array([]) +print(key_np_full) +for key in key_np_full.keys(): + key_np_full[key] = np.concatenate(tuple(key_np[key]), axis = None) + print(len(key_np_full[key])) + if max_length < len(key_np_full[key]): + max_length = len(key_np_full[key]) + +for key in key_np_full.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_sig_full_np[key] = list(np.append(key_np_full[key], np.repeat(np.nan, max_length- (len(key_np_full[key]))))) +#df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_sig_full_np) +df_s_new_np = df_sig_full_np[[f'{col}_{roi}' for col in names_sig]] + +print(len(df_s_new_np[f"wei_{roi}"])) +our_aray_results = len(df_s_new_np[f"wei_{roi}"]) + + + +df_s_new_np = df_s_new_np.dropna() +print(df_s_new_np) +len_var = [] +for col in names_sig: + len_var.append(len(df_s_new_np[f'{col}_{roi}'])) + df_s_new_np['target'] = np.ones(np.max(len_var)) +print(df_s_new_np) + + +df_s_new_np.to_csv(f'./plot/{folder_save}/numpy_data_signal.csv', sep=',', encoding='utf-8', index=False) +#df_s_new_np = pd.read_csv(f'./plot/{folder_save}/numpy_data.csv', sep=',', encoding='utf-8') +###################################################################################### + + +###################################################################################### +##### Read np arrays of background sample ############################################ +###################################################################################### +data_path = 'condor_back_07_early/' +#paths_np_back = [str(x) for x in Path(data_path + "DYJetsToLL_nlo_vau_bg").glob("**/*.npy") if ("_full" in str(x))] +paths_np_back = [str(x) for x in Path(data_path + "TTTo2L2Nu_vau_bg").glob("**/*.npy") if ("_full" in str(x))] +#paths_np_back = [str(x) for x in Path("./condor_back_04_mid/DYJetsToLL_nlo_vau_bg").glob("**/*.npy") if ("_full" in str(x))] +#print(paths_np_back)TTTo2L2Nu_vau_bg +print(len(paths_np_back)) +df_back_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_back_full_np) + +key_np_back = {} +for col in names_sig: + for rois in roiis: + key_np_back[f'{col}_{rois}'] = [] +for col in names_sig: + for rois in roiis: + for path in paths_np_back: + if f'{col}_{rois}' in path: + key_np_back[f'{col}_{rois}'].append(path) +#print(key_np_back) +for key in key_np_back.keys(): + print(len(key_np_back[key]) == len(set(key_np_back[key]))) + key_np_back[key] = [np.load(element) for element in key_np_back[key]] + print(key) + +#print(key_np_back) + +max_length_back = 0 +key_np_full_back = {} +for col in names_sig: + for rois in roiis: + key_np_full_back[f'{col}_{rois}'] = np.array([]) +for key in key_np_full_back.keys(): + key_np_full_back[key] = np.concatenate(tuple(key_np_back[key]), axis = None) + print(len(key_np_full_back[key])) + if max_length_back < len(key_np_full_back[key]): + max_length_back = len(key_np_full_back[key]) +#print(key_np_full_back) + +for key in key_np_full_back.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_back_full_np[key] = 
list(np.append(key_np_full_back[key], np.repeat(np.nan, max_length_back- (len(key_np_full_back[key]))))) +#df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_back_full_np) +df_b_full_np = df_back_full_np[[f'{col}_{roi}' for col in names_sig]] +df_b_new_np = df_b_full_np.dropna() +print(df_b_new_np) + +len_var = [] +for col in names_sig: + len_var.append(len(df_b_new_np[f'{col}_{roi}'])) + df_b_new_np['target'] = np.zeros(np.max(len_var)) +print(df_b_new_np) +df_b_new_np.to_csv(f'./plot/{folder_save}/numpy_data_bg.csv', sep=',', encoding='utf-8', index=False) +###################################################################################### + +###################################################################################### +##### Read np arrays of data sample ################################################## +###################################################################################### +data_path = 'condor_back_08_early/' +datas = ["Run2017B_DoubleMu_vau", "Run2017D_DoubleMu_vau", "Run2017E_DoubleMu_vau", "Run2017F_DoubleMu_vau"] #"Run2017C_DoubleMu_vau" +df_data = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig_data for rois in roiis]) +for data in datas: + paths_np_data = [str(x) for x in Path(data_path + data).glob("**/*.npy") if ("_full" in str(x))] + + print(len(paths_np_data)) + df_data_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig_data for rois in roiis]) + print(df_data_full_np) + + key_np_data = {} + for col in names_sig_data: + for rois in roiis: + key_np_data[f'{col}_{rois}'] = [] + for col in names_sig_data: + for rois in roiis: + for path in paths_np_data: + if f'{col}_{rois}' in path: + key_np_data[f'{col}_{rois}'].append(path) + #print(key_np_back) + for key in key_np_data.keys(): + print(len(key_np_data[key]) == len(set(key_np_data[key]))) + key_np_data[key] = [np.load(element) for element in key_np_data[key]] + print(key) + + #print(key_np_back) + + max_length_data = 0 + key_np_full_data = {} + for col in names_sig_data: + for rois in roiis: + key_np_full_data[f'{col}_{rois}'] = np.array([]) + for key in key_np_full_data.keys(): + key_np_full_data[key] = np.concatenate(tuple(key_np_data[key]), axis = None) + print(len(key_np_full_data[key])) + if max_length_data < len(key_np_full_data[key]): + max_length_data = len(key_np_full_data[key]) + #print(key_np_full_back) + + for key in key_np_full_data.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_data_full_np[key] = list(np.append(key_np_full_data[key], np.repeat(np.nan, max_length_data- (len(key_np_full_data[key]))))) + #df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) + print(df_data_full_np) + df_dat_full_np = df_data_full_np[[f'{col}_{roi}' for col in names_sig_data]] + df_dat_new_np = df_dat_full_np.dropna() + print(df_dat_new_np) + + len_var = [] + for col in names_sig_data: + len_var.append(len(df_dat_new_np[f'{col}_{roi}'])) + df_dat_new_np['target'] = np.full(np.max(len_var), 2, dtype = int) + print(df_dat_new_np) + df_data = pd.concat([df_data, df_dat_new_np], ignore_index = True) +df_data.to_csv(f'./plot/{folder_save}/numpy_data_DATA.csv', sep=',', encoding='utf-8', index=False) +###################################################################################### + +df = pd.concat([df_s_new_np, df_b_new_np], 
ignore_index = True)
+print(df)
+print(df.info())
+df.to_csv(net_path + f'/xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8', index=False)
+
+print("% of negative weights: " + str(len(df[f"wei_{roi}"][df[f"wei_{roi}"]<0])/len(df[f"wei_{roi}"])))
+
+time = arrow.now().format("YY_MM_DD")
+plt.style.use(hep.style.ROOT)
+names_sig_updated = ['m(H)', '$p_t$(H)', '$p_t$(Z)', 'm(Z)', '$p_t$($Z_{gen}$)', 'm($Z_{gen}$)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$',
+       '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet',
+       '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$',
+       '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$']
+
+names_sig_updated_data = ['m(H)', '$p_t$(H)', '$p_t$(Z)', 'm(Z)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$',
+       '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet',
+       '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$',
+       '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$']
+
+c = 0
+
+df_hists = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis])
+for col in names_sig_data[1:]:
+
+    plt.figure(figsize=(10,10))
+    len_sig = 0
+    for i in range(0,len(df['target'])):
+        if df['target'][i] == 1:
+            len_sig += 1
+    print(len_sig)
+    names_big_ax = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'pt_lead', 'pt_sublead']
+    if col in names_big_ax:
+        hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_{roi}'][:len_sig])).plot()
+        hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_{roi}'][len_sig:])).plot()
+    else:
+        hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_{roi}'][:len_sig])).plot()
+        hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_{roi}'][len_sig:])).plot()
+    if 'pt' in col:
+        if 'ratio' not in col:
+            plt.xlabel('$p_t$ in GeV')
+        else:
+            plt.xlabel('')
+    elif 'mass' in col:
+        plt.xlabel('Mass in GeV')
+    else:
+        plt.xlabel('')
+    plt.ylabel("Counts")
+    plt.title(f'{names_sig_updated_data[c]} {roi}')
+    plt.legend(['Signal', 'Background'])
+    #plt.show()
+    plt.savefig(f"./plot/{folder_save}/{col}_{roi}.jpg")
+
+    fig, ((ax), (rax)) = plt.subplots(
+        2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True
+    )
+    fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97)
+    hep.cms.label("Private work", com="13", data=True, loc=0, ax=ax)
+    counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]), bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True)
+    counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]), bins = 80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True)
+
+    counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]), bins = 80)
+    counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]), bins = 80)
+
+    data_counts, data_bins = np.histogram(np.array(df_data[f'{col}_{roi}']), bins = 50, weights = np.array(df_data[f'wei_{roi}']))
+    df_hists[f'{col}_{roi}'] = np.array(counts22)
+    ## plot reference
+    hep.histplot(
+        np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]), bins = 50, weights = np.array(df[f'wei_{roi}'][:len_sig])),
+        label= 'Higgs -> cc',
+        histtype="step",
+
color='r', + yerr=True, + ax=ax, + density = True, + ) + # + #for i in range(0, len(bins2)-1): + # x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + # y_pos_sig = counts1[i] + (counts1[i] * 0.01) + # label_p_sig = str(counts11[i]) + # x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + # y_pos = counts2[i] + (counts2[i] * 0.01) + # label_p = str(counts22[i]) + # if i%5 == 0: + # ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + # if i%6 == 0: + # ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins = 50, weights = np.array(df[f'wei_{roi}'][len_sig:])), + label= 'tt bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + density = True, + ) + ## plot compare list + ax.errorbar( + (data_bins[:-1] + data_bins[1:])/2, + np.array(data_counts), + label='Data', + marker = 'o', + color='k', + yerr=np.sqrt(np.array(data_counts)), + linestyle = "None", + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + sigratio = ratio * np.sqrt(np.where(counts1>0, (counts11/counts1**2) * 1/(np.sum(counts11))**(2) , 0) + np.where(counts2>0, (counts22/counts2**2) * 1/(np.sum(counts22))**(2), 0)) + plt.errorbar(bins1[:-1], ratio, yerr = np.abs(sigratio), color = "k", fmt = '.', marker = 'o', markeredgecolor = 'k') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated_data[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised for sig/bg)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### Smaller scale #################################################################################### + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 160, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =160, weights = 
np.array(df[f'wei_{roi}'][len_sig:]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 160) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =160) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 160, weights = np.array(df[f'wei_{roi}'][:len_sig])), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + density = True, + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%9 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%10 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =160, weights = np.array(df[f'wei_{roi}'][len_sig:])), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + density = True, + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 160, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =160, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + sigratio = ratio * np.sqrt(np.where(counts1>0, (counts11/counts1**2) * 1/(np.sum(counts11))**(2) , 0) + np.where(counts2>0, (counts22/counts2**2) * 1/(np.sum(counts22))**(2), 0)) + plt.errorbar(bins1[:-1], ratio, yerr = np.abs(sigratio), color = "k", fmt = '.', marker = 'o', markeredgecolor = 'k') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/Small_scale/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Small_scale/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### Larger scale ############################################################################# + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = 
np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 40, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =40, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 40) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =40) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 40, weights = np.array(df[f'wei_{roi}'][:len_sig])), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + density = True, + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%4 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%5 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =40, weights = np.array(df[f'wei_{roi}'][len_sig:])), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + density = True, + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 40, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =40, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + sigratio = ratio * np.sqrt(np.where(counts1>0, (counts11/counts1**2) * 1/(np.sum(counts11))**(2) , 0) + np.where(counts2>0, (counts22/counts2**2) * 1/(np.sum(counts22))**(2), 0)) + plt.errorbar(bins1[:-1], ratio, yerr = np.abs(sigratio), color = "k", fmt = '.', marker = 'o', markeredgecolor = 'k') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/Big_scale/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Big_scale/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### Smaller scale but not that small ################################################################ + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), 
gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 120, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins = 120, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 120) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins = 120) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 120, weights = np.array(df[f'wei_{roi}'][:len_sig])), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + density = True, + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%7 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%8 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins = 120, weights = np.array(df[f'wei_{roi}'][len_sig:])), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + density = True, + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 120, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins = 120, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + sigratio = ratio * np.sqrt(np.where(counts1>0, (counts11/counts1**2) * 1/(np.sum(counts11))**(2) , 0) + np.where(counts2>0, (counts22/counts2**2) * 1/(np.sum(counts22))**(2), 0)) + plt.errorbar(bins1[:-1], ratio, yerr = np.abs(sigratio), color = "k", fmt = '.', marker = 'o', markeredgecolor = 'k') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/Small_but_not_that_small_scale/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Small_but_not_that_small_scale/compare_{col}_{roi}.jpg") + + c += 1 + +df_hists.to_csv(f'./plot/{folder_save}/hists_{roi}.csv', sep=',', encoding='utf-8', index=False) + +def gaussian(x, height, 
center, width, offset): + return height*np.exp(-(x-center)**2/(2*width**2)) + offset + +def gaussiansin(x, height, center, width, offset, k, w): + return height*np.exp(-(x-center)**2/(2*width**2)) + offset + k*np.sin(x*w) + +def chiq2_gauss(x,y,sig,N,a): + chiq1 = 0 + for i in range(0,N): + chiq1 += ((y[i]-gaussian(x[i], a[0], a[1], a[2], a[3]))/sig[i])**2 + chiq1 = chiq1/(N-4) + return chiq1 + +def chiq2_gausssin(x,y,sig,N,a): + chiq1 = 0 + for i in range(0,N): + chiq1 += ((y[i]-gaussiansin(x[i], a[0], a[1], a[2], a[3], a[4], a[5]))/sig[i])**2 + chiq1 = chiq1/(N-6) + return chiq1 + +import scipy +counts2, bins2 = np.histogram(np.array(df[f'del_phi_jj_{roi}'][len_sig:]),bins = 80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + +counts22, bins22 = np.histogram(np.array(df[f'del_phi_jj_{roi}'][len_sig:]),bins = 80) + +from scipy.fft import fft, fftfreq +from scipy import stats + +yf = fft(counts22) + +sampling_rate = 40 + +xf = fftfreq(sampling_rate*2, 1/ sampling_rate) + +plt.figure(figsize = (13,8)) +plt.plot(xf, np.abs(yf)) +plt.savefig(f"./plot/{folder_save}/compare_FFT_{roi}.pdf") +plt.savefig(f"./plot/{folder_save}/compare_FFT_{roi}.jpg") + +popt_s ,pcov_s = scipy.optimize.curve_fit(gaussiansin, bins22[:-1], counts22, sigma = np.sqrt(np.array(counts22)), absolute_sigma = True, p0= [100, 1.5, 0.5, 100, 1, 12]) + +popt_g ,pcov_g = scipy.optimize.curve_fit(gaussian, bins22[:-1], counts22, sigma = np.sqrt(np.array(counts22)), absolute_sigma = True, p0= [100, 1.5, 0.5, 100]) + +print("params gauss: ", popt_g) +print("params gauss + sin : ", popt_s) + +print('\n Chi^2/dof of gauss sine is', chiq2_gausssin(bins22[:-1], counts22, np.sqrt(np.array(counts22)), len(bins22[:-1]), popt_s)) +print('\n Chi^2 of gauss sine is', 6*chiq2_gausssin(bins22[:-1], counts22, np.sqrt(np.array(counts22)), len(bins22[:-1]), popt_s)) + +print('\n Chi^2/dof of gauss peak is', chiq2_gauss(bins22[:-1], counts22, np.sqrt(np.array(counts22)), len(bins22[:-1]), popt_g)) +print('\n Chi^2 of gauss peak is', 4*chiq2_gauss(bins22[:-1], counts22, np.sqrt(np.array(counts22)), len(bins22[:-1]), popt_g)) + +p_val_sin = 1- stats.chi2.cdf(x=6*chiq2_gausssin(bins22[:-1], counts22, np.sqrt(np.array(counts22)), len(bins22[:-1]), popt_s), df=len(counts22)-6) +p_val_gauss = 1- stats.chi2.cdf(x=4*chiq2_gauss(bins22[:-1], counts22, np.sqrt(np.array(counts22)), len(bins22[:-1]), popt_g), df=len(counts22)-4) + +print("p-value of gauss is: ", p_val_gauss, 1-p_val_gauss) +print("p-value of gauss + sin is: ", p_val_sin, 1- p_val_sin) +## plot compare list +def plot_data(x, y, unc, params, residuals, residuals_errors, pulls, pulls_errors, x_label, y_label, ylims, axes): + xlin = np.linspace(0, 3.2) + # Plot measurements and fitted parabola + axes[0].errorbar(x, y, unc, linestyle='None', color='blue', fmt='.', label='DY bg') + axes[0].plot(xlin, gaussian(xlin, *params), color='red', label='Fitted gaussian') + axes[0].set_xlabel(x_label) + axes[0].set_xlim(0, 3.2) + axes[0].set_ylabel(y_label) + axes[0].set_ylim(ylims[0], ylims[1]) + axes[0].legend() + axes[0].grid(True) + # Plot residuals + axes[1].errorbar(x, residuals, yerr=residuals_errors, color='green', capsize=3, fmt='.', ls='') + axes[1].axhline(0, color='red', linestyle='--') + axes[1].set_xlabel(x_label) + axes[1].set_ylabel('Residuals') + axes[1].grid(True) + # Plot pulls + axes[2].errorbar(x, pulls, yerr=pulls_errors, color='purple', capsize=3, fmt='.', ls='') + axes[2].axhline(0, color='red', linestyle='--') + axes[2].set_xlabel(x_label) + 
axes[2].set_ylabel('Pulls') + axes[2].grid(True) + +def plot_data_sin(x, y, unc, params, residuals, residuals_errors, pulls, pulls_errors, x_label, y_label, ylims, axes): + xlin = np.linspace(0, 3.2) + # Plot measurements and fitted parabola + axes[0].errorbar(x, y, unc, linestyle='None', color='blue', fmt='.', label='DY bg') + axes[0].plot(xlin, gaussiansin(xlin, *params), color='red', label='Fitted gaussian + sin') + axes[0].set_xlabel(x_label) + axes[0].set_xlim(0, 3.2) + axes[0].set_ylabel(y_label) + axes[0].set_ylim(ylims[0], ylims[1]) + axes[0].legend() + axes[0].grid(True) + # Plot residuals + axes[1].errorbar(x, residuals, yerr=residuals_errors, color='green', capsize=3, fmt='.', ls='') + axes[1].axhline(0, color='red', linestyle='--') + axes[1].set_xlabel(x_label) + axes[1].set_ylabel('Residuals') + axes[1].grid(True) + # Plot pulls + axes[2].errorbar(x, pulls, yerr=pulls_errors, color='purple', capsize=3, fmt='.', ls='') + axes[2].axhline(0, color='red', linestyle='--') + axes[2].set_xlabel(x_label) + axes[2].set_ylabel('Pulls') + axes[2].grid(True) + +error_count = np.sqrt(np.array(counts22)) +res_gauss = np.array(counts22) - gaussian(bins22[:-1], *popt_g) +res_gauss_sin = np.array(counts22) - gaussiansin(bins22[:-1], *popt_s) + +pulls_gauss = res_gauss/error_count +pulls_gauss_sin = res_gauss_sin/error_count +pulls_err_gauss = np.sqrt(error_count**2)/error_count + +fig, axes = plt.subplots(3, 2, figsize=(10, 8), sharex=True) +yAxisRange = [0, 400] +# Plot the first column (existing data) +plot_data(bins22[:-1], counts22, error_count, popt_g, res_gauss, error_count, pulls_gauss, pulls_err_gauss, 'x', 'y', yAxisRange, axes[:, 0]) +# Plot the second column (strange data) +plot_data_sin(bins22[:-1], counts22, error_count, popt_s, res_gauss_sin, error_count, pulls_gauss_sin, pulls_err_gauss, 'x', 'y (+sin)', yAxisRange, axes[:, 1]) +# Adjust spacing between subplots +fig.subplots_adjust(hspace=0) +fig.subplots_adjust(wspace=0.3) +#plt.show() + +fig.savefig(f"./plot/{folder_save}/compare_del_phi_jj_chi_{roi}.pdf") +fig.savefig(f"./plot/{folder_save}/compare_del_phi_jj_chi_{roi}.jpg") + +X = df.drop("target", axis = 1) +print(X) +X = X.drop(f"wei_{roi}", axis = 1) +X = X.drop(f"Z_mass_{roi}", axis = 1) +X = X.drop(f"Z_pt_gen_{roi}", axis = 1) +X = X.drop(f"Z_mass_gen_{roi}", axis = 1) +print(X) +print(X.info()) + +y = df["target"] +print(y) + + + + +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder + +categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + +from sklearn.preprocessing import StandardScaler +numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + +cat_cols = X.select_dtypes(exclude = "number").columns +num_cols = X.select_dtypes(include = "number").columns + +print(cat_cols) +print(num_cols) + +from sklearn.compose import ColumnTransformer + +full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + +import xgboost as xgb + +X_processed = full_processor.fit_transform(X) +y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state 
= 1121218)
+
+from sklearn.metrics import accuracy_score
+
+### Init classifier: shallow trees (max_depth = 3) with a small learning rate and L1/L2 regularisation against overfitting
+xgb_cl = xgb.XGBClassifier(booster = 'gbtree', base_score = 0.5, learning_rate = 0.01, gamma = 1, reg_alpha = 0.2, reg_lambda = 0.2, n_estimators = 1000, max_depth = 3, subsample = 0.8)
+
+### Fit
+xgb_cl.fit(X_train, y_train)
+
+print(xgb_cl)
+### Predict
+preds = xgb_cl.predict(X_test)
+
+print(accuracy_score(y_test, preds))
+
+from xgboost import plot_importance
+from xgboost import plot_tree
+
+importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_})
+importances = importances.sort_values(by = "Importance", ascending = False)
+importances = importances.set_index('Feature')
+print(importances)
+importances.plot.bar()
+
+fig, ax = plt.subplots(figsize=(17,12))
+plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax)
+plt.xlabel('Feature scores')
+plt.ylabel("Feature names")
+plt.title('Importance plot')
+plt.legend([''])
+#plt.show()
+plt.savefig(f"./plot/{folder_save}/importance.jpg")
+
+plt.figure(figsize=(17,12))
+plot_tree(xgb_cl, fmap = 'feature_map.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(f"./plot/{folder_save}/boost_tree.jpg", dpi = 1800)
+### result = 1/(1+np.exp(-leaf_value)) is the probability of belonging to class 1
+#plt.show()
diff --git a/xgb_test_no_coffea.py b/xgb_test_no_coffea.py
index 16547aa..f16bc8d 100644
--- a/xgb_test_no_coffea.py
+++ b/xgb_test_no_coffea.py
@@ -12,9 +12,15 @@
 )
 net_path = "/net/scratch_cms3a/vaulin/"
-folder_save = 'eval_23_04_19_later'
+folder_save = 'eval_23_08_02'
 if not os.path.exists(f"./plot/{folder_save}"):
     os.mkdir(f"./plot/{folder_save}")
+if not os.path.exists(f"./plot/{folder_save}/Small_scale"):
+    os.mkdir(f"./plot/{folder_save}/Small_scale")
+if not os.path.exists(f"./plot/{folder_save}/Big_scale"):
+    os.mkdir(f"./plot/{folder_save}/Big_scale")
+if not os.path.exists(f"./plot/{folder_save}/Small_but_not_that_small_scale"):
+    os.mkdir(f"./plot/{folder_save}/Small_but_not_that_small_scale")
 if not os.path.exists(net_path + f"plot/{folder_save}"):
     os.mkdir(net_path + f"plot/{folder_save}")
 def autoranger(array):
@@ -35,11 +41,12 @@ def autoranger(array):
     'del_phi_l2_subleading', 'del_phi_l2_leading']
 
 roiis = ['high_mumu', 'high_ee', 'low_mumu', 'low_ee']
-roi = 'low_mumu'
+roi = 'high_mumu'
 ######################################################################################
 ##### Read np arrays of signal sample ################################################
 ######################################################################################
-paths_np = [str(x) for x in Path("./condor_signal_04_mid/ZHToCC_vau_sig").glob("**/*.npy") if ("_full" in str(x))]
+data_path = 'condor_signal_06_mid/'
+paths_np = [str(x) for x in Path(data_path + "ZHToCC_vau_sig").glob("**/*.npy") if ("_full" in str(x))]
 #print(paths_np)
 print(len(paths_np))
 df_sig_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis])
@@ -54,23 +61,38 @@ def autoranger(array):
         for path in paths_np:
             if f'{col}_{rois}' in path:
                 key_np[f'{col}_{rois}'].append(path)
-#print(key_np)
+
 for key in key_np.keys():
+    #print(len(key_np[key]) == len(set(key_np[key])))
     key_np[key] = [np.load(element) for element in key_np[key]]
-#print(key_np)
+    #print(key)
+
+print(key_np)
 
 key_np_full = {}
+max_length = 0
 for col in names_sig:
     for rois in roiis:
         key_np_full[f'{col}_{rois}'] = np.array([])
+print(key_np_full)
 for key in key_np_full.keys():
     key_np_full[key] = 
np.concatenate(tuple(key_np[key]), axis = None) -#print(key_np_full) + print(len(key_np_full[key])) + if max_length < len(key_np_full[key]): + max_length = len(key_np_full[key]) for key in key_np_full.keys(): - df_sig_full_np[key] = pd.Series(key_np_full[key]) + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_sig_full_np[key] = list(np.append(key_np_full[key], np.repeat(np.nan, max_length- (len(key_np_full[key]))))) +#df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) print(df_sig_full_np) df_s_new_np = df_sig_full_np[[f'{col}_{roi}' for col in names_sig]] + +print(len(df_s_new_np[f"wei_{roi}"])) +our_aray_results = len(df_s_new_np[f"wei_{roi}"]) + + + df_s_new_np = df_s_new_np.dropna() print(df_s_new_np) len_var = [] @@ -78,14 +100,21 @@ def autoranger(array): len_var.append(len(df_s_new_np[f'{col}_{roi}'])) df_s_new_np['target'] = np.ones(np.max(len_var)) print(df_s_new_np) + + +df_s_new_np.to_csv(f'./plot/{folder_save}/numpy_data_signal.csv', sep=',', encoding='utf-8', index=False) +#df_s_new_np = pd.read_csv(f'./plot/{folder_save}/numpy_data.csv', sep=',', encoding='utf-8') ###################################################################################### ###################################################################################### ##### Read np arrays of background sample ############################################ ###################################################################################### -paths_np_back = [str(x) for x in Path("./condor_back_04_mid/DYJetsToLL_nlo_vau_bg").glob("**/*.npy") if ("_full" in str(x))] -#print(paths_np_back) +data_path = 'condor_back_07_early/' +paths_np_back = [str(x) for x in Path(data_path + "DYJetsToLL_nlo_vau_bg").glob("**/*.npy") if ("_full" in str(x))] +#paths_np_back = [str(x) for x in Path(data_path + "TTTo2L2Nu_vau_bg").glob("**/*.npy") if ("_full" in str(x))] +#paths_np_back = [str(x) for x in Path("./condor_back_04_mid/DYJetsToLL_nlo_vau_bg").glob("**/*.npy") if ("_full" in str(x))] +#print(paths_np_back)TTTo2L2Nu_vau_bg print(len(paths_np_back)) df_back_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) print(df_back_full_np) @@ -101,22 +130,31 @@ def autoranger(array): key_np_back[f'{col}_{rois}'].append(path) #print(key_np_back) for key in key_np_back.keys(): + print(len(key_np_back[key]) == len(set(key_np_back[key]))) key_np_back[key] = [np.load(element) for element in key_np_back[key]] + print(key) + #print(key_np_back) +max_length_back = 0 key_np_full_back = {} for col in names_sig: for rois in roiis: key_np_full_back[f'{col}_{rois}'] = np.array([]) for key in key_np_full_back.keys(): key_np_full_back[key] = np.concatenate(tuple(key_np_back[key]), axis = None) + print(len(key_np_full_back[key])) + if max_length_back < len(key_np_full_back[key]): + max_length_back = len(key_np_full_back[key]) #print(key_np_full_back) for key in key_np_full_back.keys(): - df_back_full_np[key] = pd.Series(key_np_full_back[key]) + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_back_full_np[key] = list(np.append(key_np_full_back[key], np.repeat(np.nan, max_length_back- (len(key_np_full_back[key]))))) +#df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) print(df_back_full_np) -df_b_new_np = df_back_full_np[[f'{col}_{roi}' for col in names_sig]] -df_b_new_np = 
df_b_new_np.dropna() +df_b_full_np = df_back_full_np[[f'{col}_{roi}' for col in names_sig]] +df_b_new_np = df_b_full_np.dropna() print(df_b_new_np) len_var = [] @@ -124,9 +162,9 @@ def autoranger(array): len_var.append(len(df_b_new_np[f'{col}_{roi}'])) df_b_new_np['target'] = np.zeros(np.max(len_var)) print(df_b_new_np) - +df_b_new_np.to_csv(f'./plot/{folder_save}/numpy_data_bg.csv', sep=',', encoding='utf-8', index=False) ###################################################################################### - +folder_save = 'eval_23_08_02' df = pd.concat([df_s_new_np, df_b_new_np], ignore_index = True) print(df) print(df.info()) @@ -142,6 +180,8 @@ def autoranger(array): '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$'] c = 0 + + for col in names_sig[1:]: plt.figure(figsize=(10,10)) @@ -170,9 +210,7 @@ def autoranger(array): plt.title(f'{names_sig_updated[c]}_low_ee') plt.legend(['Signal', 'Background']) #plt.show() - plt.savefig(net_path +f"plot/{folder_save}/{col}_{roi}.jpg") - - + plt.savefig(f"./plot/{folder_save}/{col}_{roi}.jpg") fig, ((ax), (rax)) = plt.subplots( 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True @@ -220,7 +258,8 @@ def autoranger(array): counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) ratio = np.divide(counts1, counts2, where = (counts2 != 0)) - plt.plot(bins1[:-1], ratio, 'ko') + sigratio = ratio * np.sqrt(np.where(counts1>0, (counts11/counts1**2) * 1/(np.sum(counts11))**(2) , 0) + np.where(counts2>0, (counts22/counts2**2) * 1/(np.sum(counts22))**(2), 0)) + plt.errorbar(bins1[:-1], ratio, yerr = np.abs(sigratio), color = "k", fmt = '.', marker = 'o', markeredgecolor = 'k') plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') @@ -255,31 +294,31 @@ def autoranger(array): if "norm" in config.keys() and config["norm"]: logext = "_norm" + logext ''' - fig.savefig(net_path +f"plot/{folder_save}/compare_{col}_{roi}.pdf") - fig.savefig(net_path +f"plot/{folder_save}/compare_{col}_{roi}.jpg") + fig.savefig(f"./plot/{folder_save}/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/compare_{col}_{roi}.jpg") ###################################################################################################### - #### No rescaling #################################################################################### + #### Smaller scale #################################################################################### ###################################################################################################### fig, ((ax), (rax)) = plt.subplots( 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True ) fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) - counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig])) - counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:])) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 160, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =160, weights = 
np.array(df[f'wei_{roi}'][len_sig:]), density = True) - counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80) - counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80) + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 160) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =160) ## plot reference hep.histplot( - np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig])), + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 160, weights = np.array(df[f'wei_{roi}'][:len_sig])), label= 'Higgs -> cc', histtype="step", color='r', yerr=True, ax=ax, - + density = True, ) for i in range(0, len(bins2)-1): x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] @@ -288,26 +327,27 @@ def autoranger(array): x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] y_pos = counts2[i] + (counts2[i] * 0.01) label_p = str(counts22[i]) - if i%5 == 0: + if i%9 == 0: ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') - if i%6 == 0: + if i%10 == 0: ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') ## plot compare list hep.histplot( - np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:])), + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =160, weights = np.array(df[f'wei_{roi}'][len_sig:])), label='DY bg', histtype="step", color='g', yerr=True, ax=ax, - + density = True, ) # plot ratio of com/Ref - counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) - counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 160, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =160, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) ratio = np.divide(counts1, counts2, where = (counts2 != 0)) - plt.plot(bins1[:-1], ratio, 'ko') + sigratio = ratio * np.sqrt(np.where(counts1>0, (counts11/counts1**2) * 1/(np.sum(counts11))**(2) , 0) + np.where(counts2>0, (counts22/counts2**2) * 1/(np.sum(counts22))**(2), 0)) + plt.errorbar(bins1[:-1], ratio, yerr = np.abs(sigratio), color = "k", fmt = '.', marker = 'o', markeredgecolor = 'k') plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') @@ -342,31 +382,31 @@ def autoranger(array): if "norm" in config.keys() and config["norm"]: logext = "_norm" + logext ''' - fig.savefig(net_path +f"plot/{folder_save}/compare_no_dense_{col}_{roi}.pdf") - fig.savefig(net_path +f"plot/{folder_save}/compare_no_dense_{col}_{roi}.jpg") + fig.savefig(f"./plot/{folder_save}/Small_scale/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Small_scale/compare_{col}_{roi}.jpg") ###################################################################################################### - #### No rescaling hist density ###################################################################### + #### Larger scale ############################################################################# ###################################################################################################### fig, ((ax), (rax)) = plt.subplots( 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, 
sharex=True ) fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) - counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) - counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 40, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =40, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) - counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, density = True) - counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, density = True) + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 40) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =40) ## plot reference hep.histplot( - np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True), + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 40, weights = np.array(df[f'wei_{roi}'][:len_sig])), label= 'Higgs -> cc', histtype="step", color='r', yerr=True, ax=ax, - + density = True, ) for i in range(0, len(bins2)-1): x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] @@ -375,26 +415,27 @@ def autoranger(array): x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] y_pos = counts2[i] + (counts2[i] * 0.01) label_p = str(counts22[i]) - if i%5 == 0: + if i%4 == 0: ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') - if i%6 == 0: + if i%5 == 0: ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') ## plot compare list hep.histplot( - np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True), + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =40, weights = np.array(df[f'wei_{roi}'][len_sig:])), label='DY bg', histtype="step", color='g', yerr=True, ax=ax, - + density = True, ) # plot ratio of com/Ref - counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) - counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 40, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =40, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) ratio = np.divide(counts1, counts2, where = (counts2 != 0)) - plt.plot(bins1[:-1], ratio, 'ko') + sigratio = ratio * np.sqrt(np.where(counts1>0, (counts11/counts1**2) * 1/(np.sum(counts11))**(2) , 0) + np.where(counts2>0, (counts22/counts2**2) * 1/(np.sum(counts22))**(2), 0)) + plt.errorbar(bins1[:-1], ratio, yerr = np.abs(sigratio), color = "k", fmt = '.', marker = 'o', markeredgecolor = 'k') plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') @@ -429,25 +470,25 @@ def autoranger(array): if "norm" in config.keys() and config["norm"]: logext = "_norm" + logext ''' - fig.savefig(net_path +f"plot/{folder_save}/compare_np_dense_{col}_{roi}.pdf") - 
fig.savefig(net_path +f"plot/{folder_save}/compare_np_dense_{col}_{roi}.jpg") + fig.savefig(f"./plot/{folder_save}/Big_scale/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Big_scale/compare_{col}_{roi}.jpg") ###################################################################################################### - #### No rescaling hist density True ################################################################# + #### Smaller scale but not that small ################################################################ ###################################################################################################### fig, ((ax), (rax)) = plt.subplots( 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True ) fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) - counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) - counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 120, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins = 120, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) - counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, density = True) - counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, density = True) + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 120) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins = 120) ## plot reference hep.histplot( - np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True), + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 120, weights = np.array(df[f'wei_{roi}'][:len_sig])), label= 'Higgs -> cc', histtype="step", color='r', @@ -462,13 +503,13 @@ def autoranger(array): x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] y_pos = counts2[i] + (counts2[i] * 0.01) label_p = str(counts22[i]) - if i%5 == 0: + if i%7 == 0: ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') - if i%6 == 0: + if i%8 == 0: ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') ## plot compare list hep.histplot( - np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True), + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins = 120, weights = np.array(df[f'wei_{roi}'][len_sig:])), label='DY bg', histtype="step", color='g', @@ -478,10 +519,11 @@ def autoranger(array): ) # plot ratio of com/Ref - counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) - counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 120, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins = 120, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) ratio = np.divide(counts1, counts2, 
where = (counts2 != 0)) - plt.plot(bins1[:-1], ratio, 'ko') + sigratio = ratio * np.sqrt(np.where(counts1>0, (counts11/counts1**2) * 1/(np.sum(counts11))**(2) , 0) + np.where(counts2>0, (counts22/counts2**2) * 1/(np.sum(counts22))**(2), 0)) + plt.errorbar(bins1[:-1], ratio, yerr = np.abs(sigratio), color = "k", fmt = '.', marker = 'o', markeredgecolor = 'k') plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') @@ -505,6 +547,7 @@ def autoranger(array): hep.mpl_magic(ax=ax) ax.set_ylim(bottom=0) + logext = "" ''' # log y axis @@ -516,10 +559,62 @@ def autoranger(array): if "norm" in config.keys() and config["norm"]: logext = "_norm" + logext ''' - fig.savefig(net_path +f"plot/{folder_save}/compare_np_dense_true_{col}_{roi}.pdf") - fig.savefig(net_path +f"plot/{folder_save}/compare_np_dense_true_{col}_{roi}.jpg") + fig.savefig(f"./plot/{folder_save}/Small_but_not_that_small_scale/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Small_but_not_that_small_scale/compare_{col}_{roi}.jpg") c += 1 + + +fig, ((ax), (rax)) = plt.subplots( +2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True +) +fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) +hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) +counts2, bins2 = np.histogram(np.array(df[f'del_phi_jj_{roi}'][len_sig:]),bins = 80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + +counts22, bins22 = np.histogram(np.array(df[f'del_phi_jj_{roi}'][len_sig:]),bins = 80) +## plot compare list +hep.histplot( + np.histogram(np.array(df[f'del_phi_jj_{roi}'][len_sig:]),bins =80), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + density = False, ) + # plot ratio of com/Ref + +counts2, bins2 = np.histogram(np.array(df[f'del_phi_jj_{roi}'][len_sig:]),bins = 80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) +#ratio = np.divide(counts1, counts2, where = (counts2 != 0)) +#sigratio = ratio * np.sqrt(np.where(counts1>0, (counts11/counts1**2) * 1/(np.sum(counts11))**(2) , 0) + np.where(counts2>0, (counts22/counts2**2) * 1/(np.sum(counts22))**(2), 0)) +#plt.errorbar(bins1[:-1], ratio, yerr = np.abs(sigratio), color = "k", fmt = '.', marker = 'o', markeredgecolor = 'k') +#plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + +## plot settings, adjust range +rax.set_xlabel(f'$\Delta\Phi(j1, j2)$ {roi}') +ax.set_xlabel(None) +ax.set_ylabel("Events (normalised)") +rax.set_ylabel('$\\frac{Signal}{Background}$') +ax.ticklabel_format(style="sci", scilimits=(-3, 3)) +ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) +ax.legend() +rax.set_ylim(0.0, 2.0) +xmin, xmax, maxval, minval = autoranger(np.array(df[f'del_phi_jj_{roi}'][:len_sig])) +rax.set_xlim(minval, maxval) +at = AnchoredText( + "", + loc=2, + frameon=False, + ) +ax.add_artist(at) +hep.mpl_magic(ax=ax) +ax.set_ylim(bottom=0) + +logext = "" + +fig.savefig(f"./plot/{folder_save}/Small_scale/compare_{col}_{roi}.pdf") +fig.savefig(f"./plot/{folder_save}/Small_scale/compare_{col}_{roi}.jpg") X = df.drop("target", axis = 1) print(X) @@ -594,12 +689,12 @@ def autoranger(array): plt.title('Importance plot') plt.legend(['']) #plt.show() -plt.savefig(net_path + f"plot/{folder_save}/importance.jpg") +plt.savefig(f"./plot/{folder_save}/importance.jpg") plt.figure(figsize=(17,12)) plot_tree(xgb_cl, fmap = 'feature_map.txt') plt.title('Decision tree graph') #plt.show() -plt.savefig(net_path + f"plot/{folder_save}/boost_tree.jpg", dpi = 1800) 
+plt.savefig(f"./plot/{folder_save}/boost_tree.jpg", dpi = 1800) ###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 #plt.show() diff --git a/xgb_test_no_coffea_chi2.py b/xgb_test_no_coffea_chi2.py new file mode 100644 index 0000000..3a89590 --- /dev/null +++ b/xgb_test_no_coffea_chi2.py @@ -0,0 +1,778 @@ +from coffea.util import load +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt, mplhep as hep +import hist +import argparse, sys, os, arrow, glob, yaml +from matplotlib.offsetbox import AnchoredText +from pathlib import Path +import os +from BTVNanoCommissioning.utils.plot_utils import ( + plotratio, + +) +net_path = "/net/scratch_cms3a/vaulin/" +folder_save = 'eval_23_08_06_tt' +if not os.path.exists(f"./plot/{folder_save}"): + os.mkdir(f"./plot/{folder_save}") +if not os.path.exists(f"./plot/{folder_save}/Small_scale"): + os.mkdir(f"./plot/{folder_save}/Small_scale") +if not os.path.exists(f"./plot/{folder_save}/Big_scale"): + os.mkdir(f"./plot/{folder_save}/Big_scale") +if not os.path.exists(f"./plot/{folder_save}/Small_but_not_that_small_scale"): + os.mkdir(f"./plot/{folder_save}/Small_but_not_that_small_scale") +if not os.path.exists(net_path + f"plot/{folder_save}"): + os.mkdir(net_path + f"plot/{folder_save}") +def autoranger(array): + val, axis = array, np.arange(0,len(array)+1) + for i in range(len(val)): + if val[i] != 0: + mins = i + break + for i in reversed(range(len(val))): + if val[i] != 0: + maxs = i + 1 + break + print(axis[mins], axis[maxs]) + return axis[mins], axis[maxs], np.max(val), np.min(val) +names_sig = ['wei', 'Higgs_mass', 'Higgs_pt', 'Z_pt', 'Z_mass', 'Z_pt_gen', 'Z_mass_gen', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +roiis = ['high_mumu', 'high_ee', 'low_mumu', 'low_ee'] +roi = 'high_ee' +###################################################################################### +##### Read np arrays of signal sample ################################################ +###################################################################################### +data_path = 'condor_signal_06_mid/' +paths_np = [str(x) for x in Path(data_path + "ZHToCC_vau_sig").glob("**/*.npy") if ("_full" in str(x))] +#print(paths_np) +print(len(paths_np)) +df_sig_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_sig_full_np) + +key_np = {} +for col in names_sig: + for rois in roiis: + key_np[f'{col}_{rois}'] = [] +for col in names_sig: + for rois in roiis: + for path in paths_np: + if f'{col}_{rois}' in path: + key_np[f'{col}_{rois}'].append(path) + +for key in key_np.keys(): + #print(len(key_np[key]) == len(set(key_np[key]))) + key_np[key] = [np.load(element) for element in key_np[key]] + #print(key) + +print(key_np) + +key_np_full = {} +max_length = 0 +for col in names_sig: + for rois in roiis: + key_np_full[f'{col}_{rois}'] = np.array([]) +print(key_np_full) +for key in key_np_full.keys(): + key_np_full[key] = np.concatenate(tuple(key_np[key]), axis = None) + print(len(key_np_full[key])) + if max_length < len(key_np_full[key]): + max_length = len(key_np_full[key]) + +for key in key_np_full.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_sig_full_np[key] = list(np.append(key_np_full[key], np.repeat(np.nan, max_length- (len(key_np_full[key]))))) +#df_sig_full_np = 
pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis])
+print(df_sig_full_np)
+df_s_new_np = df_sig_full_np[[f'{col}_{roi}' for col in names_sig]]
+
+print(len(df_s_new_np[f"wei_{roi}"]))
+our_array_results = len(df_s_new_np[f"wei_{roi}"])
+
+
+
+df_s_new_np = df_s_new_np.dropna()
+print(df_s_new_np)
+len_var = []
+for col in names_sig:
+    len_var.append(len(df_s_new_np[f'{col}_{roi}']))
+    df_s_new_np['target'] = np.ones(np.max(len_var))
+print(df_s_new_np)
+
+
+df_s_new_np.to_csv(f'./plot/{folder_save}/numpy_data_signal.csv', sep=',', encoding='utf-8', index=False)
+#df_s_new_np = pd.read_csv(f'./plot/{folder_save}/numpy_data.csv', sep=',', encoding='utf-8')
+######################################################################################
+
+
+######################################################################################
+##### Read np arrays of background sample ############################################
+######################################################################################
+data_path = 'condor_back_07_early/'
+#paths_np_back = [str(x) for x in Path(data_path + "DYJetsToLL_nlo_vau_bg").glob("**/*.npy") if ("_full" in str(x))]
+paths_np_back = [str(x) for x in Path(data_path + "TTTo2L2Nu_vau_bg").glob("**/*.npy") if ("_full" in str(x))]
+#paths_np_back = [str(x) for x in Path("./condor_back_04_mid/DYJetsToLL_nlo_vau_bg").glob("**/*.npy") if ("_full" in str(x))]
+#print(paths_np_back)  # TTTo2L2Nu_vau_bg
+print(len(paths_np_back))
+df_back_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis])
+print(df_back_full_np)
+
+key_np_back = {}
+for col in names_sig:
+    for rois in roiis:
+        key_np_back[f'{col}_{rois}'] = []
+for col in names_sig:
+    for rois in roiis:
+        for path in paths_np_back:
+            if f'{col}_{rois}' in path:
+                key_np_back[f'{col}_{rois}'].append(path)
+#print(key_np_back)
+for key in key_np_back.keys():
+    print(len(key_np_back[key]) == len(set(key_np_back[key])))
+    key_np_back[key] = [np.load(element) for element in key_np_back[key]]
+    print(key)
+
+#print(key_np_back)
+
+max_length_back = 0
+key_np_full_back = {}
+for col in names_sig:
+    for rois in roiis:
+        key_np_full_back[f'{col}_{rois}'] = np.array([])
+for key in key_np_full_back.keys():
+    key_np_full_back[key] = np.concatenate(tuple(key_np_back[key]), axis = None)
+    print(len(key_np_full_back[key]))
+    if max_length_back < len(key_np_full_back[key]):
+        max_length_back = len(key_np_full_back[key])
+#print(key_np_full_back)
+
+for key in key_np_full_back.keys():
+    #df_sig_full_np[key] = pd.Series(key_np_full[key])
+    df_back_full_np[key] = list(np.append(key_np_full_back[key], np.repeat(np.nan, max_length_back- (len(key_np_full_back[key])))))
+#df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis])
+print(df_back_full_np)
+df_b_full_np = df_back_full_np[[f'{col}_{roi}' for col in names_sig]]
+df_b_new_np = df_b_full_np.dropna()
+print(df_b_new_np)
+
+len_var = []
+for col in names_sig:
+    len_var.append(len(df_b_new_np[f'{col}_{roi}']))
+    df_b_new_np['target'] = np.zeros(np.max(len_var))
+print(df_b_new_np)
+df_b_new_np.to_csv(f'./plot/{folder_save}/numpy_data_bg.csv', sep=',', encoding='utf-8', index=False)
+######################################################################################
+df = pd.concat([df_s_new_np, df_b_new_np], ignore_index = True)
+print(df)
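+# Negative MC weights appear in wei_{roi} (their fraction is printed a few
+# lines below). A minimal sketch of one common workaround -- clipping them at
+# zero before handing them to the BDT fit -- is kept here purely as a comment;
+# `sample_weight` and `sample_weight_train` are hypothetical names that are
+# not used anywhere in this script:
+#
+#   w = df[f"wei_{roi}"].to_numpy()
+#   sample_weight = np.clip(w, 0.0, None)   # zero out negative-weight events
+#   # later: xgb_cl.fit(X_train, y_train, sample_weight=sample_weight_train)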
+print(df.info())
+df.to_csv(net_path + f'/xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8', index=False)
+
+print("% of negative weights: " + str(len(df[f"wei_{roi}"][df[f"wei_{roi}"]<0])/len(df[f"wei_{roi}"])))
+
+time = arrow.now().format("YY_MM_DD")
+plt.style.use(hep.style.ROOT)
+names_sig_updated = ['m(H)', '$p_t$(H)', '$p_t$(Z)', 'm(Z)', '$p_t$($Z_{gen}$)', 'm($Z_{gen}$)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$',
+       '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet',
+       '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$',
+       '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$']
+
+c = 0
+
+df_hists = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis])
+for col in names_sig[1:]:
+
+    plt.figure(figsize=(10,10))
+    len_sig = 0
+    for i in range(0,len(df['target'])):
+        if df['target'][i] == 1:
+            len_sig += 1
+    print(len_sig)
+    names_big_ax = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'pt_lead', 'pt_sublead']
+    if col in names_big_ax:
+        hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_{roi}'][:len_sig])).plot()
+        hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_{roi}'][len_sig:])).plot()
+    else:
+        hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_{roi}'][:len_sig])).plot()
+        hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_{roi}'][len_sig:])).plot()
+    if 'pt' in col:
+        if 'ratio' not in col:
+            plt.xlabel('$p_t$ in GeV')
+        else:
+            plt.xlabel('')
+    elif 'mass' in col:
+        plt.xlabel('Mass in GeV')
+    else:
+        plt.xlabel('')
+    plt.ylabel("Counts")
+    plt.title(f'{names_sig_updated[c]}_{roi}')
+    plt.legend(['Signal', 'Background'])
+    #plt.show()
+    plt.savefig(f"./plot/{folder_save}/{col}_{roi}.jpg")
+
+    fig, ((ax), (rax)) = plt.subplots(
+        2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True
+    )
+    fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97)
+    hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax)
+    counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True)
+    counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True)
+
+    counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80)
+    counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80)
+    df_hists[f'{col}_{roi}'] = np.array(counts22)
+    ## plot reference
+    hep.histplot(
+        np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig])),
+        label= 'Higgs -> cc',
+        histtype="step",
+        color='r',
+        yerr=True,
+        ax=ax,
+        density = True,
+    )
+    for i in range(0, len(bins2)-1):
+        x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i]
+        y_pos_sig = counts1[i] + (counts1[i] * 0.01)
+        label_p_sig = str(counts11[i])
+        x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i]
+        y_pos = counts2[i] + (counts2[i] * 0.01)
+        label_p = str(counts22[i])
+        if i%5 == 0:
+            ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green')
+        if i%6 == 0:
+            ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red')
+    ## plot compare list
+    hep.histplot(
+        np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = 
np.array(df[f'wei_{roi}'][len_sig:])), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + density = True, + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + sigratio = ratio * np.sqrt(np.where(counts1>0, (counts11/counts1**2) * 1/(np.sum(counts11))**(2) , 0) + np.where(counts2>0, (counts22/counts2**2) * 1/(np.sum(counts22))**(2), 0)) + plt.errorbar(bins1[:-1], ratio, yerr = np.abs(sigratio), color = "k", fmt = '.', marker = 'o', markeredgecolor = 'k') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### Smaller scale #################################################################################### + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 160, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =160, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 160) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =160) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 160, weights = np.array(df[f'wei_{roi}'][:len_sig])), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + density = True, + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%9 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%10 == 0: + ax.text(x_pos_sig, 
y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =160, weights = np.array(df[f'wei_{roi}'][len_sig:])), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + density = True, + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 160, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =160, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + sigratio = ratio * np.sqrt(np.where(counts1>0, (counts11/counts1**2) * 1/(np.sum(counts11))**(2) , 0) + np.where(counts2>0, (counts22/counts2**2) * 1/(np.sum(counts22))**(2), 0)) + plt.errorbar(bins1[:-1], ratio, yerr = np.abs(sigratio), color = "k", fmt = '.', marker = 'o', markeredgecolor = 'k') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/Small_scale/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Small_scale/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### Larger scale ############################################################################# + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 40, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =40, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 40) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =40) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 40, weights = np.array(df[f'wei_{roi}'][:len_sig])), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + density = True, + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = 
counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%4 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%5 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =40, weights = np.array(df[f'wei_{roi}'][len_sig:])), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + density = True, + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 40, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =40, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + sigratio = ratio * np.sqrt(np.where(counts1>0, (counts11/counts1**2) * 1/(np.sum(counts11))**(2) , 0) + np.where(counts2>0, (counts22/counts2**2) * 1/(np.sum(counts22))**(2), 0)) + plt.errorbar(bins1[:-1], ratio, yerr = np.abs(sigratio), color = "k", fmt = '.', marker = 'o', markeredgecolor = 'k') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/Big_scale/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Big_scale/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### Smaller scale but not that small ################################################################ + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 120, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins = 120, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 120) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins = 120) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 120, weights = np.array(df[f'wei_{roi}'][:len_sig])), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + density = True, + ) + for i in range(0, 
len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%7 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%8 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins = 120, weights = np.array(df[f'wei_{roi}'][len_sig:])), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + density = True, + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 120, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins = 120, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + sigratio = ratio * np.sqrt(np.where(counts1>0, (counts11/counts1**2) * 1/(np.sum(counts11))**(2) , 0) + np.where(counts2>0, (counts22/counts2**2) * 1/(np.sum(counts22))**(2), 0)) + plt.errorbar(bins1[:-1], ratio, yerr = np.abs(sigratio), color = "k", fmt = '.', marker = 'o', markeredgecolor = 'k') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/Small_but_not_that_small_scale/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Small_but_not_that_small_scale/compare_{col}_{roi}.jpg") + + c += 1 + +df_hists.to_csv(f'./plot/{folder_save}/hists_{roi}.csv', sep=',', encoding='utf-8', index=False) + +def gaussian(x, height, center, width, offset): + return height*np.exp(-(x-center)**2/(2*width**2)) + offset + +def gaussiansin(x, height, center, width, offset, k, w): + return height*np.exp(-(x-center)**2/(2*width**2)) + offset + k*np.sin(x*w) + +def chiq2_gauss(x,y,sig,N,a): + chiq1 = 0 + for i in range(0,N): + chiq1 += ((y[i]-gaussian(x[i], a[0], a[1], a[2], a[3]))/sig[i])**2 + chiq1 = chiq1/(N-4) + return chiq1 + +def chiq2_gausssin(x,y,sig,N,a): + chiq1 = 0 + for i in range(0,N): + chiq1 += ((y[i]-gaussiansin(x[i], a[0], a[1], a[2], a[3], a[4], a[5]))/sig[i])**2 + chiq1 = chiq1/(N-6) + return chiq1 + +import scipy +counts2, bins2 = np.histogram(np.array(df[f'del_phi_jj_{roi}'][len_sig:]),bins = 80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + +counts22, bins22 = np.histogram(np.array(df[f'del_phi_jj_{roi}'][len_sig:]),bins = 80) + +from scipy.fft import fft, fftfreq +from scipy import stats + +yf = fft(counts22) + 
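+# The frequency axis below is built from a hard-coded sampling rate; since the
+# FFT input counts22 has len(counts22) == 80 samples spaced by the histogram
+# bin width, an exact axis could be derived from the bin edges instead. A
+# sketch (bin_width and xf_exact are names introduced only for illustration):
+#
+#   bin_width = bins22[1] - bins22[0]                 # width of one delta-phi bin
+#   xf_exact = fftfreq(len(counts22), d=bin_width)    # cycles per unit of delta-phi
+#
+# The fixed sampling_rate = 40 (n = 80, d = 1/40) reproduces this only if the
+# histogram spans a range of length 2, so it is an approximation for [0, pi].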
+
+# frequency axis: yf has len(counts22) samples; using the actual bin spacing
+# of the del_phi_jj histogram gives frequencies in cycles per unit del_phi
+xf = fftfreq(len(counts22), d = bins22[1] - bins22[0])
+
+plt.figure(figsize = (13,8))
+plt.plot(np.fft.fftshift(xf), np.abs(np.fft.fftshift(yf)))
+plt.savefig(f"./plot/{folder_save}/compare_FFT_{roi}.pdf")
+plt.savefig(f"./plot/{folder_save}/compare_FFT_{roi}.jpg")
+
+popt_s ,pcov_s = scipy.optimize.curve_fit(gaussiansin, bins22[:-1], counts22, sigma = np.sqrt(np.array(counts22)), absolute_sigma = True, p0= [100, 1.5, 0.5, 100, 1, 12])
+
+popt_g ,pcov_g = scipy.optimize.curve_fit(gaussian, bins22[:-1], counts22, sigma = np.sqrt(np.array(counts22)), absolute_sigma = True, p0= [100, 1.5, 0.5, 100])
+
+print("params gauss: ", popt_g)
+print("params gauss + sin : ", popt_s)
+
+# total chi^2 = reduced chi^2 * ndof, with ndof = n_points - n_parameters
+n_points = len(bins22[:-1])
+ndof_s = n_points - 6
+ndof_g = n_points - 4
+chi2red_s = chiq2_gausssin(bins22[:-1], counts22, np.sqrt(np.array(counts22)), n_points, popt_s)
+chi2red_g = chiq2_gauss(bins22[:-1], counts22, np.sqrt(np.array(counts22)), n_points, popt_g)
+
+print('\n Chi^2/dof of gauss sine is', chi2red_s)
+print('\n Chi^2 of gauss sine is', ndof_s*chi2red_s)
+
+print('\n Chi^2/dof of gauss peak is', chi2red_g)
+print('\n Chi^2 of gauss peak is', ndof_g*chi2red_g)
+
+p_val_sin = 1 - stats.chi2.cdf(x=ndof_s*chi2red_s, df=ndof_s)
+p_val_gauss = 1 - stats.chi2.cdf(x=ndof_g*chi2red_g, df=ndof_g)
+
+print("p-value of gauss is: ", p_val_gauss, 1-p_val_gauss)
+print("p-value of gauss + sin is: ", p_val_sin, 1- p_val_sin)
+## fit overview, residuals and pulls; one function serves both models
+def plot_fit(x, y, unc, params, model, model_label, residuals, residuals_errors, pulls, pulls_errors, x_label, y_label, ylims, axes):
+    xlin = np.linspace(0, 3.2)
+    # Plot measurements and fitted model
+    axes[0].errorbar(x, y, unc, linestyle='None', color='blue', fmt='.', label='DY bg')
+    axes[0].plot(xlin, model(xlin, *params), color='red', label=model_label)
+    axes[0].set_xlabel(x_label)
+    axes[0].set_xlim(0, 3.2)
+    axes[0].set_ylabel(y_label)
+    axes[0].set_ylim(ylims[0], ylims[1])
+    axes[0].legend()
+    axes[0].grid(True)
+    # Plot residuals
+    axes[1].errorbar(x, residuals, yerr=residuals_errors, color='green', capsize=3, fmt='.', ls='')
+    axes[1].axhline(0, color='red', linestyle='--')
+    axes[1].set_xlabel(x_label)
+    axes[1].set_ylabel('Residuals')
+    axes[1].grid(True)
+    # Plot pulls
+    axes[2].errorbar(x, pulls, yerr=pulls_errors, color='purple', capsize=3, fmt='.', ls='')
+    axes[2].axhline(0, color='red', linestyle='--')
+    axes[2].set_xlabel(x_label)
+    axes[2].set_ylabel('Pulls')
+    axes[2].grid(True)
+
+error_count = np.sqrt(np.array(counts22))
+res_gauss = np.array(counts22) - gaussian(bins22[:-1], *popt_g)
+res_gauss_sin = np.array(counts22) - gaussiansin(bins22[:-1], *popt_s)
+
+pulls_gauss = res_gauss/error_count
+pulls_gauss_sin = res_gauss_sin/error_count
+pulls_err_gauss = np.ones_like(error_count)  # pull uncertainties are unity by construction
+
+fig, axes = plt.subplots(3, 2, figsize=(10, 8), sharex=True)
+yAxisRange = [0, 400]
+# Left column: Gaussian-only fit
+plot_fit(bins22[:-1], counts22, error_count, popt_g, gaussian, 'Fitted gaussian', res_gauss, error_count, pulls_gauss, pulls_err_gauss, 'x', 'y', yAxisRange, axes[:, 0])
+# Right column: Gaussian + sine fit
+plot_fit(bins22[:-1], counts22, error_count, popt_s, gaussiansin, 'Fitted gaussian + sin', res_gauss_sin, error_count, pulls_gauss_sin, pulls_err_gauss, 'x', 'y (+sin)', yAxisRange, axes[:, 1])
+# Adjust spacing between subplots
+fig.subplots_adjust(hspace=0)
+fig.subplots_adjust(wspace=0.3)
+#plt.show()
+
+fig.savefig(f"./plot/{folder_save}/compare_del_phi_jj_chi_{roi}.pdf")
+fig.savefig(f"./plot/{folder_save}/compare_del_phi_jj_chi_{roi}.jpg")
+
+X = df.drop("target", axis = 1)
+print(X)
+X = X.drop(f"wei_{roi}", axis = 1)
+X = X.drop(f"Z_mass_{roi}", axis = 1)
+X = X.drop(f"Z_pt_gen_{roi}", axis = 1)
+X = X.drop(f"Z_mass_gen_{roi}", axis = 1)
+print(X)
+print(X.info())
+
+y = df["target"]
+print(y)
+
+from sklearn.impute import SimpleImputer
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import OneHotEncoder
+
+# note: sparse=False was renamed to sparse_output=False in scikit-learn >= 1.2
+categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),])
+
+from sklearn.preprocessing import StandardScaler
+numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())])
+
+cat_cols = X.select_dtypes(exclude = "number").columns
+num_cols = X.select_dtypes(include = "number").columns
+
+print(cat_cols)
+print(num_cols)
+
+from sklearn.compose import ColumnTransformer
+
+full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),])
+
+import xgboost as xgb
+
+X_processed = full_processor.fit_transform(X)
+y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1))
+
+from sklearn.model_selection import train_test_split
+
+X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state = 1121218)
+
+from sklearn.metrics import accuracy_score
+
+### Init classifier
+xgb_cl = xgb.XGBClassifier(booster = 'gbtree', base_score = 0.5, learning_rate = 0.01, gamma = 1, reg_alpha = 0.2, reg_lambda = 0.2, n_estimators = 1000, max_depth = 3, subsample = 0.8)
+
+### Fit
+xgb_cl.fit(X_train, y_train)
+
+print(xgb_cl)
+### Predict
+preds = xgb_cl.predict(X_test)
+
+print(accuracy_score(y_test, preds))
+
+from xgboost import plot_importance
+from xgboost import plot_tree
+
+importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_})
+importances = importances.sort_values(by = "Importance", ascending = False)
+importances = importances.set_index('Feature')
+print(importances)
+importances.plot.bar()
+
+fig, ax = plt.subplots(figsize=(17,12))
+plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax)
+plt.xlabel('Feature scores')
+plt.ylabel("Feature names")
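+# Accuracy alone says little for an imbalanced signal/background mix; a
+# threshold-independent cross-check (a sketch -- roc_auc_score is standard
+# scikit-learn, but this block is not part of the original training flow):
+from sklearn.metrics import roc_auc_score
+print("ROC AUC:", roc_auc_score(y_test.ravel(), xgb_cl.predict_proba(X_test)[:, 1]))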
+plt.title('Importance plot') +plt.legend(['']) +#plt.show() +plt.savefig(f"./plot/{folder_save}/importance.jpg") + +plt.figure(figsize=(17,12)) +plot_tree(xgb_cl, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"./plot/{folder_save}/boost_tree.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show() diff --git a/xgb_test_no_coffea_diff_bgs.py b/xgb_test_no_coffea_diff_bgs.py new file mode 100644 index 0000000..04b661a --- /dev/null +++ b/xgb_test_no_coffea_diff_bgs.py @@ -0,0 +1,776 @@ +from coffea.util import load +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt, mplhep as hep +import hist +import argparse, sys, os, arrow, glob, yaml +from matplotlib.offsetbox import AnchoredText +from pathlib import Path +import os +from BTVNanoCommissioning.utils.plot_utils import ( + plotratio, + +) +net_path = "/net/scratch_cms3a/vaulin/" +folder_save = 'eval_23_07_25_2' +if not os.path.exists(f"./plot/{folder_save}"): + os.mkdir(f"./plot/{folder_save}") +if not os.path.exists(f"./plot/{folder_save}/No_dense"): + os.mkdir(f"./plot/{folder_save}/No_dense") +if not os.path.exists(f"./plot/{folder_save}/Np_dense"): + os.mkdir(f"./plot/{folder_save}/Np_dense") +if not os.path.exists(f"./plot/{folder_save}/Np_dense_True"): + os.mkdir(f"./plot/{folder_save}/Np_dense_True") +if not os.path.exists(net_path + f"plot/{folder_save}"): + os.mkdir(net_path + f"plot/{folder_save}") +def autoranger(array): + val, axis = array, np.arange(0,len(array)+1) + for i in range(len(val)): + if val[i] != 0: + mins = i + break + for i in reversed(range(len(val))): + if val[i] != 0: + maxs = i + 1 + break + print(axis[mins], axis[maxs]) + return axis[mins], axis[maxs], np.max(val), np.min(val) +names_sig = ['wei', 'Higgs_mass', 'Higgs_pt', 'Z_pt', 'Z_mass', 'Z_pt_gen', 'Z_mass_gen', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +names_sig_data = ['wei', 'Higgs_mass', 'Higgs_pt', 'Z_pt', 'Z_mass', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +roiis = ['high_mumu', 'high_ee', 'low_mumu', 'low_ee'] +roi = 'low_mumu' +#roi = 'low_ee' + +###################################################################################### +##### Read np arrays of signal sample ################################################ +###################################################################################### +data_path = 'condor_signal_06_mid/' +paths_np = [str(x) for x in Path(data_path + "ZHToCC_vau_sig").glob("**/*.npy") if ("_full" in str(x))] +#print(paths_np) +print(len(paths_np)) +df_sig_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_sig_full_np) + +key_np = {} +for col in names_sig: + for rois in roiis: + key_np[f'{col}_{rois}'] = [] +for col in names_sig: + for rois in roiis: + for path in paths_np: + if f'{col}_{rois}' in path: + key_np[f'{col}_{rois}'].append(path) + +for key in key_np.keys(): + #print(len(key_np[key]) == len(set(key_np[key]))) + key_np[key] = [np.load(element) for element in key_np[key]] + #print(key) + +print(key_np) + +key_np_full = {} +max_length = 0 +for col in names_sig: + for rois in roiis: + 
key_np_full[f'{col}_{rois}'] = np.array([]) +print(key_np_full) +for key in key_np_full.keys(): + key_np_full[key] = np.concatenate(tuple(key_np[key]), axis = None) + print(len(key_np_full[key])) + if max_length < len(key_np_full[key]): + max_length = len(key_np_full[key]) + +for key in key_np_full.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_sig_full_np[key] = list(np.append(key_np_full[key], np.repeat(np.nan, max_length- (len(key_np_full[key]))))) +#df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_sig_full_np) +df_s_new_np = df_sig_full_np[[f'{col}_{roi}' for col in names_sig]] + +print(len(df_s_new_np[f"wei_{roi}"])) +our_aray_results = len(df_s_new_np[f"wei_{roi}"]) + + + +df_s_new_np = df_s_new_np.dropna() +print(df_s_new_np) +len_var = [] +for col in names_sig: + len_var.append(len(df_s_new_np[f'{col}_{roi}'])) + df_s_new_np['target'] = np.ones(np.max(len_var)) + df_s_new_np['target_bg'] = np.zeros(np.max(len_var)) +print(df_s_new_np) + + +df_s_new_np.to_csv(f'./plot/{folder_save}/numpy_data_signal_{roi}.csv', sep=',', encoding='utf-8', index=False) +#df_s_new_np = pd.read_csv(f'./plot/{folder_save}/numpy_data.csv', sep=',', encoding='utf-8') +###################################################################################### + + +###################################################################################### +##### Read np arrays of background sample ############################################ +###################################################################################### +data_path = 'condor_back_07_early/' +def bg_processor(bg, nr): + paths_np_back = [str(x) for x in Path(data_path + f"{bg}").glob("**/*.npy") if ("_full" in str(x))] + #paths_np_back = [str(x) for x in Path("./condor_back_04_mid/DYJetsToLL_nlo_vau_bg").glob("**/*.npy") if ("_full" in str(x))] + #print(paths_np_back) + print(len(paths_np_back)) + df_back_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) + print(df_back_full_np) + + key_np_back = {} + for col in names_sig: + for rois in roiis: + key_np_back[f'{col}_{rois}'] = [] + for col in names_sig: + for rois in roiis: + for path in paths_np_back: + if f'{col}_{rois}' in path: + key_np_back[f'{col}_{rois}'].append(path) + #print(key_np_back) + for key in key_np_back.keys(): + print(len(key_np_back[key]) == len(set(key_np_back[key]))) + key_np_back[key] = [np.load(element) for element in key_np_back[key]] + print(key) + + #print(key_np_back) + + max_length_back = 0 + key_np_full_back = {} + for col in names_sig: + for rois in roiis: + key_np_full_back[f'{col}_{rois}'] = np.array([]) + for key in key_np_full_back.keys(): + key_np_full_back[key] = np.concatenate(tuple(key_np_back[key]), axis = None) + print(len(key_np_full_back[key])) + if max_length_back < len(key_np_full_back[key]): + max_length_back = len(key_np_full_back[key]) + #print(key_np_full_back) + + for key in key_np_full_back.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_back_full_np[key] = list(np.append(key_np_full_back[key], np.repeat(np.nan, max_length_back- (len(key_np_full_back[key]))))) + #df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) + print(df_back_full_np) + df_b_full_np = df_back_full_np[[f'{col}_{roi}' for col in names_sig]] + df_b_new_np = 
df_b_full_np.dropna() + print(df_b_new_np) + + len_var = [] + for col in names_sig: + len_var.append(len(df_b_new_np[f'{col}_{roi}'])) + df_b_new_np['target'] = np.zeros(np.max(len_var)) + df_b_new_np['target_bg'] = np.array([nr]*np.max(len_var)) + print(df_b_new_np) + df_b_new_np.to_csv(f'./plot/{folder_save}/numpy_data_bg_{bg}_{roi}.csv', sep=',', encoding='utf-8', index=False) + return df_b_new_np, len(df_b_new_np['target']) + +df_b_new_np_dy, len_dy = bg_processor("DYJetsToLL_nlo_vau_bg", 1) +df_b_new_np_zz, len_zz = bg_processor("ZZTo2L2Q_vau_bg", 2) +df_b_new_np_wz, len_wz = bg_processor("WZTo2Q2L_vau_bg", 3) +df_b_new_np_tt, len_tt = bg_processor("TTTo2L2Nu_vau_bg", 4) +df_b_new_np_zhtobb, len_zhtobb = bg_processor("ZH_HToBB_ZLL_vau_bg_old", 5) +max_len_bg = 0 +for l in [len_dy, len_zz, len_wz, len_tt, len_zhtobb]: + if max_len_bg < l: + max_len_bg = l + +###################################################################################### +##### Read np arrays of data sample ################################################## +###################################################################################### +data_path = 'condor_back_08_early/' +datas = ["Run2017B_DoubleMu_vau", "Run2017D_DoubleMu_vau", "Run2017E_DoubleMu_vau", "Run2017F_DoubleMu_vau"] #"Run2017C_DoubleMu_vau" +df_data = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig_data for rois in roiis]) +for data in datas: + paths_np_data = [str(x) for x in Path(data_path + data).glob("**/*.npy") if ("_full" in str(x))] + + print(len(paths_np_data)) + df_data_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig_data for rois in roiis]) + print(df_data_full_np) + + key_np_data = {} + for col in names_sig_data: + for rois in roiis: + key_np_data[f'{col}_{rois}'] = [] + for col in names_sig_data: + for rois in roiis: + for path in paths_np_data: + if f'{col}_{rois}' in path: + key_np_data[f'{col}_{rois}'].append(path) + #print(key_np_back) + for key in key_np_data.keys(): + print(len(key_np_data[key]) == len(set(key_np_data[key]))) + key_np_data[key] = [np.load(element) for element in key_np_data[key]] + print(key) + + #print(key_np_back) + + max_length_data = 0 + key_np_full_data = {} + for col in names_sig_data: + for rois in roiis: + key_np_full_data[f'{col}_{rois}'] = np.array([]) + for key in key_np_full_data.keys(): + key_np_full_data[key] = np.concatenate(tuple(key_np_data[key]), axis = None) + print(len(key_np_full_data[key])) + if max_length_data < len(key_np_full_data[key]): + max_length_data = len(key_np_full_data[key]) + #print(key_np_full_back) + + for key in key_np_full_data.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_data_full_np[key] = list(np.append(key_np_full_data[key], np.repeat(np.nan, max_length_data- (len(key_np_full_data[key]))))) + #df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) + print(df_data_full_np) + df_dat_full_np = df_data_full_np[[f'{col}_{roi}' for col in names_sig_data]] + df_dat_new_np = df_dat_full_np.dropna() + print(df_dat_new_np) + + len_var = [] + for col in names_sig_data: + len_var.append(len(df_dat_new_np[f'{col}_{roi}'])) + df_dat_new_np['target'] = np.full(np.max(len_var), 2, dtype = int) + print(df_dat_new_np) + df_data = pd.concat([df_data, df_dat_new_np], ignore_index = True) +df_data.to_csv(f'./plot/{folder_save}/numpy_data_DATA.csv', sep=',', encoding='utf-8', index=False) 
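+
+# The signal, background and data loaders above all repeat the same pattern:
+# glob the per-variable .npy chunks, concatenate them, and pad every column
+# with NaN up to the longest one so they fit a rectangular DataFrame. A
+# generic helper could replace all three copies (a sketch -- load_sample is
+# illustrative only and is not called by the code below):
+def load_sample(base_dir, sample, columns, regions):
+    paths = [str(x) for x in Path(base_dir + sample).glob("**/*.npy") if "_full" in str(x)]
+    arrays = {
+        f"{c}_{r}": np.concatenate(
+            [np.load(p) for p in paths if f"{c}_{r}" in p] or [np.array([])],
+            axis = None,
+        )
+        for c in columns
+        for r in regions
+    }
+    max_len = max((len(a) for a in arrays.values()), default = 0)
+    return pd.DataFrame(
+        {k: np.append(a, np.repeat(np.nan, max_len - len(a))) for k, a in arrays.items()}
+    )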
+
+######################################################################################
+######################################################################################
+#folder_save = 'eval_23_07_25_2'
+df = pd.concat([df_s_new_np, df_b_new_np_dy], ignore_index = True)
+df = pd.concat([df, df_b_new_np_zz], ignore_index = True)
+df = pd.concat([df, df_b_new_np_wz], ignore_index = True)
+df = pd.concat([df, df_b_new_np_tt], ignore_index = True)
+df = pd.concat([df, df_b_new_np_zhtobb], ignore_index = True)
+print(df)
+print(df.info())
+df.to_csv(net_path + f'/xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8', index=False)
+df.to_csv(f'./plot/{folder_save}/xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8', index=False)
+
+#xsec_weights = [0.002342, 6077., 3.74, 6.419, 88.51, 0.00720]
+
+xsec_weights = [1 , 1, 1, 1, 1, 1]
+
+#df = pd.read_csv(f'xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8')
+
+print("Fraction of negative weights: " + str(len(df[f"wei_{roi}"][df[f"wei_{roi}"]<0])/len(df[f"wei_{roi}"])))
+
+time = arrow.now().format("YY_MM_DD")
+plt.style.use(hep.style.ROOT)
+names_sig_updated = ['m(H)', '$p_t$(H)', '$p_t$(Z)', 'm(Z)', '$p_t$($Z_{gen}$)', 'm($Z_{gen}$)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$',
+          '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet',
+          '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$',
+          '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$']
+
+names_sig_updated_data = ['m(H)', '$p_t$(H)', '$p_t$(Z)', 'm(Z)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$',
+          '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet',
+          '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$',
+          '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$']
+
+
+c = 0
+for col in names_sig[1:]:
+
+    plt.figure(figsize=(10,10))
+    len_sig = int((df['target'] == 1).sum())  # number of signal rows, counted vectorised
+    print(len_sig)
+    names_big_ax = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'pt_lead', 'pt_sublead']
+    if col in names_big_ax:
+        hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_{roi}'][:len_sig])).plot()
+        hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_{roi}'][len_sig:])).plot()
+    else:
+        hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_{roi}'][:len_sig])).plot()
+        hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_{roi}'][len_sig:])).plot()
+    if 'pt' in col:
+        if 'ratio' not in col:
+            plt.xlabel('$p_t$ in GeV')
+        else:
+            plt.xlabel('')
+    elif 'mass' in col:
+        plt.xlabel('Mass in GeV')
+    else:
+        plt.xlabel('')
+    plt.ylabel("Counts")
+    plt.title(f'{names_sig_updated[c]}_{roi}')
+    plt.legend(['Signal', 'Background'])
+    #plt.show()
+    plt.savefig(f"./plot/{folder_save}/{col}_{roi}.jpg")
+
+
+
+    fig, ((ax), (rax)) = plt.subplots(
+        2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True
+    )
+    fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97)
+    hep.cms.label("Private work", com="13", data=True, loc=0, ax=ax)
+    counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = 
True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins =80) + ## plot reference + hep.histplot( + #np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 25, weights = 100000*xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])), + #label= 'ZH -> cc signal $\cdot 10^5$', + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 50, weights = xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])), + label= 'ZH -> cc signal', + histtype="step", + color='r', + #yerr= np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 25, weights = 100000*xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig]))[0]), + yerr = True, + ax=ax, + density = True, + ) + #for i in range(0, len(bins2)-1): + # x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + # y_pos_sig = counts1[i] + (counts1[i] * 0.01) + # label_p_sig = str(counts11[i]) + # x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + # y_pos = counts2[i] + (counts2[i] * 0.01) + # label_p = str(counts22[i]) + # if i%5 == 0: + # ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + # if i%6 == 0: + # ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + n_bins = 80 + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + bins = np.array([minval + i*((maxval-minval)/50) for i in range(0, 51)]) + print(bins) + hep.histplot( + [np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]))], + stack = True, + label=['DY bg', 'ZZ bg', 'WZ bg', 'tt bg', 'ZH -> bb bg'], + histtype="fill", + color=['g', 'y', 'b', 'm', 'c'], + #bins = np.arange(80), + yerr = True, + #yerr= [np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]))[0]), + # np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]))[0]), + 
#np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]))[0]),
+               #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]))[0])],
+        ax=ax,
+        density = True,
+        alpha = [0.3, 0.3, 0.3, 0.3, 0.3],
+        edgecolor = ["k", "k", "k", "k", "k"],
+
+    )
+
+    # plot ratio of the signal density to each background density
+    nbinning = 50
+    counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True)
+    counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]), density = True)
+    counts3, bins3 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), density = True)
+    counts4, bins4 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), density = True)
+    counts5, bins5 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), density = True)
+    counts6, bins6 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), density = True)
+
+    # np.divide with `where` needs an explicit `out` array; without it the
+    # entries of empty background bins are left uninitialised
+    ratio_dy = np.divide(counts1, counts2, out = np.zeros_like(counts1), where = (counts2 != 0))
+    ratio_zz = np.divide(counts1, counts3, out = np.zeros_like(counts1), where = (counts3 != 0))
+    ratio_wz = np.divide(counts1, counts4, out = np.zeros_like(counts1), where = (counts4 != 0))
+    ratio_tt = np.divide(counts1, counts5, out = np.zeros_like(counts1), where = (counts5 != 0))
+    ratio_zhtobb = np.divide(counts1, counts6, out = np.zeros_like(counts1), where = (counts6 != 0))
+    rax.plot(bins1[:-1], ratio_dy, 'go')
+    rax.plot(bins1[:-1], ratio_zz, 'yo')
+    rax.plot(bins1[:-1], ratio_wz, 'bo')
+    rax.plot(bins1[:-1], ratio_tt, 'mo')
+    rax.plot(bins1[:-1], ratio_zhtobb, 'co')
+    rax.plot(bins1[:-1], [1]*len(ratio_dy), '--', color = 'black')
+
+
+    ## plot settings, adjust range
+    rax.set_xlabel(f'{names_sig_updated[c]} {roi}')
+    ax.set_xlabel(None)
+    ax.set_ylabel("Events (normalised)")
+    rax.set_ylabel('$\\frac{Signal}{Background}$')
+    ax.ticklabel_format(style="sci", scilimits=(-3, 3))
+    ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05))
+    ax.legend()
+    rax.set_ylim(0.0, 4.0)
+    xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig]))
+    rax.set_xlim(minval, maxval)
+    at = AnchoredText(
+        "",
+        loc=2,
+        frameon=False,
+    )
+    ax.add_artist(at)
+    #hep.mpl_magic(ax=ax)
+    ax.set_ylim(bottom=0)
+
+    logext = ""
+    '''
+    # log y axis
+    if "log" in config.keys() and config["log"]:
+        ax.set_yscale("log")
+        logext = "_log"
+        ax.set_ylim(bottom=0.1)
+        hep.mpl_magic(ax=ax)
+    if "norm" in config.keys() and config["norm"]:
+        logext = "_norm" + logext
+    '''
+    fig.savefig(f"./plot/{folder_save}/compare_{col}_{roi}.pdf")
+    fig.savefig(f"./plot/{folder_save}/compare_{col}_{roi}.jpg")
+
+    
###################################################################################################### + #### No rescaling #################################################################################### + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig])) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:])) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig])), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%5 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%6 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:])), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/No_dense/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/No_dense/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### No rescaling hist density 
###################################################################### + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, density = True) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, density = True) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%5 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%6 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/Np_dense/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Np_dense/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### No rescaling hist density True 
################################################################# + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, density = True) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, density = True) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + density = True, + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%5 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%6 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + density = True, + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/Np_dense_True/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Np_dense_True/compare_{col}_{roi}.jpg") + + c += 1 + +X = df.drop("target", axis = 1) +X = X.drop("target_bg", axis = 1) +print(X) +X = X.drop(f"wei_{roi}", axis = 1) +X = 
X.drop(f"Z_mass_{roi}", axis = 1) +X = X.drop(f"Z_pt_gen_{roi}", axis = 1) +X = X.drop(f"Z_mass_gen_{roi}", axis = 1) +print(X) +print(X.info()) + +y = df["target"] +print(y) + + + + +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder + +categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + +from sklearn.preprocessing import StandardScaler +numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + +cat_cols = X.select_dtypes(exclude = "number").columns +num_cols = X.select_dtypes(include = "number").columns + +print(cat_cols) +print(num_cols) + +from sklearn.compose import ColumnTransformer + +full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + +import xgboost as xgb + +X_processed = full_processor.fit_transform(X) +y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state = 1121218) + +from sklearn.metrics import accuracy_score + +### Init classifier +xgb_cl = xgb.XGBClassifier(booster = 'gbtree', base_score = 0.5, learning_rate = 0.01, gamma = 1, reg_alpha = 0.2, reg_lambda = 0.2, n_estimators = 1000, max_depth = 3, subsample = 0.8) + +### Fit +xgb_cl.fit(X_train, y_train) + +print(xgb_cl) +### Predict +preds = xgb_cl.predict(X_test) + +print(accuracy_score(y_test, preds)) + +from xgboost import plot_importance +from xgboost import plot_tree + +importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_}) +importances = importances.sort_values(by = "Importance", ascending = False) +importances = importances.set_index('Feature') +print(importances) +importances.plot.bar() + +fig, ax = plt.subplots(figsize=(17,12)) +plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax) +plt.xlabel('Feature scores') +plt.ylabel("Feature names") +plt.title('Importance plot') +plt.legend(['']) +#plt.show() +plt.savefig(f"./plot/{folder_save}/importance.jpg") + +plt.figure(figsize=(17,12)) +plot_tree(xgb_cl, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"./plot/{folder_save}/boost_tree.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show() diff --git a/xgb_test_no_coffea_diff_bgs_DATA.py b/xgb_test_no_coffea_diff_bgs_DATA.py new file mode 100644 index 0000000..d0325c0 --- /dev/null +++ b/xgb_test_no_coffea_diff_bgs_DATA.py @@ -0,0 +1,793 @@ +from coffea.util import load +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt, mplhep as hep +import hist +import argparse, sys, os, arrow, glob, yaml +from matplotlib.offsetbox import AnchoredText +from pathlib import Path +import os +from BTVNanoCommissioning.utils.plot_utils import ( + plotratio, + +) +net_path = "/net/scratch_cms3a/vaulin/" +folder_save = 'eval_23_08_22' +if not os.path.exists(f"./plot/{folder_save}"): + os.mkdir(f"./plot/{folder_save}") +if not os.path.exists(f"./plot/{folder_save}/No_dense"): + os.mkdir(f"./plot/{folder_save}/No_dense") +if not os.path.exists(f"./plot/{folder_save}/Np_dense"): + 
os.mkdir(f"./plot/{folder_save}/Np_dense") +if not os.path.exists(f"./plot/{folder_save}/Np_dense_True"): + os.mkdir(f"./plot/{folder_save}/Np_dense_True") +if not os.path.exists(net_path + f"plot/{folder_save}"): + os.mkdir(net_path + f"plot/{folder_save}") +def autoranger(array): + val, axis = array, np.arange(0,len(array)+1) + for i in range(len(val)): + if val[i] != 0: + mins = i + break + for i in reversed(range(len(val))): + if val[i] != 0: + maxs = i + 1 + break + print(axis[mins], axis[maxs]) + return axis[mins], axis[maxs], np.max(val), np.min(val) +names_sig = ['wei', 'Higgs_mass', 'Higgs_pt', 'Z_pt', 'Z_mass', 'Z_pt_gen', 'Z_mass_gen', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +names_sig_data = ['wei', 'Higgs_mass', 'Higgs_pt', 'Z_pt', 'Z_mass', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +roiis = ['high_mumu', 'high_ee', 'low_mumu', 'low_ee'] +roi = 'low_mumu' +#roi = 'low_ee' + +###################################################################################### +##### Read np arrays of signal sample ################################################ +###################################################################################### +data_path = 'condor_signal_06_mid/' +paths_np = [str(x) for x in Path(data_path + "ZHToCC_vau_sig").glob("**/*.npy") if ("_full" in str(x))] +#print(paths_np) +print(len(paths_np)) +df_sig_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_sig_full_np) + +key_np = {} +for col in names_sig: + for rois in roiis: + key_np[f'{col}_{rois}'] = [] +for col in names_sig: + for rois in roiis: + for path in paths_np: + if f'{col}_{rois}' in path: + key_np[f'{col}_{rois}'].append(path) + +for key in key_np.keys(): + #print(len(key_np[key]) == len(set(key_np[key]))) + key_np[key] = [np.load(element) for element in key_np[key]] + #print(key) + +print(key_np) + +key_np_full = {} +max_length = 0 +for col in names_sig: + for rois in roiis: + key_np_full[f'{col}_{rois}'] = np.array([]) +print(key_np_full) +for key in key_np_full.keys(): + key_np_full[key] = np.concatenate(tuple(key_np[key]), axis = None) + print(len(key_np_full[key])) + if max_length < len(key_np_full[key]): + max_length = len(key_np_full[key]) + +for key in key_np_full.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_sig_full_np[key] = list(np.append(key_np_full[key], np.repeat(np.nan, max_length- (len(key_np_full[key]))))) +#df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_sig_full_np) +df_s_new_np = df_sig_full_np[[f'{col}_{roi}' for col in names_sig]] + +print(len(df_s_new_np[f"wei_{roi}"])) +our_aray_results = len(df_s_new_np[f"wei_{roi}"]) + + + +df_s_new_np = df_s_new_np.dropna() +print(df_s_new_np) +len_var = [] +for col in names_sig: + len_var.append(len(df_s_new_np[f'{col}_{roi}'])) + df_s_new_np['target'] = np.ones(np.max(len_var)) + df_s_new_np['target_bg'] = np.zeros(np.max(len_var)) +print(df_s_new_np) + + +df_s_new_np.to_csv(f'./plot/{folder_save}/numpy_data_signal_{roi}.csv', sep=',', encoding='utf-8', index=False) +#df_s_new_np = 
pd.read_csv(f'./plot/{folder_save}/numpy_data.csv', sep=',', encoding='utf-8') +###################################################################################### + + +###################################################################################### +##### Read np arrays of background sample ############################################ +###################################################################################### +data_path = 'condor_back_07_early/' +def bg_processor(bg, nr): + paths_np_back = [str(x) for x in Path(data_path + f"{bg}").glob("**/*.npy") if ("_full" in str(x))] + #paths_np_back = [str(x) for x in Path("./condor_back_04_mid/DYJetsToLL_nlo_vau_bg").glob("**/*.npy") if ("_full" in str(x))] + #print(paths_np_back) + print(len(paths_np_back)) + df_back_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) + print(df_back_full_np) + + key_np_back = {} + for col in names_sig: + for rois in roiis: + key_np_back[f'{col}_{rois}'] = [] + for col in names_sig: + for rois in roiis: + for path in paths_np_back: + if f'{col}_{rois}' in path: + key_np_back[f'{col}_{rois}'].append(path) + #print(key_np_back) + for key in key_np_back.keys(): + print(len(key_np_back[key]) == len(set(key_np_back[key]))) + key_np_back[key] = [np.load(element) for element in key_np_back[key]] + print(key) + + #print(key_np_back) + + max_length_back = 0 + key_np_full_back = {} + for col in names_sig: + for rois in roiis: + key_np_full_back[f'{col}_{rois}'] = np.array([]) + for key in key_np_full_back.keys(): + key_np_full_back[key] = np.concatenate(tuple(key_np_back[key]), axis = None) + print(len(key_np_full_back[key])) + if max_length_back < len(key_np_full_back[key]): + max_length_back = len(key_np_full_back[key]) + #print(key_np_full_back) + + for key in key_np_full_back.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_back_full_np[key] = list(np.append(key_np_full_back[key], np.repeat(np.nan, max_length_back- (len(key_np_full_back[key]))))) + #df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) + print(df_back_full_np) + df_b_full_np = df_back_full_np[[f'{col}_{roi}' for col in names_sig]] + df_b_new_np = df_b_full_np.dropna() + print(df_b_new_np) + + len_var = [] + for col in names_sig: + len_var.append(len(df_b_new_np[f'{col}_{roi}'])) + df_b_new_np['target'] = np.zeros(np.max(len_var)) + df_b_new_np['target_bg'] = np.array([nr]*np.max(len_var)) + print(df_b_new_np) + df_b_new_np.to_csv(f'./plot/{folder_save}/numpy_data_bg_{bg}_{roi}.csv', sep=',', encoding='utf-8', index=False) + return df_b_new_np, len(df_b_new_np['target']) + +df_b_new_np_dy, len_dy = bg_processor("DYJetsToLL_nlo_vau_bg", 1) +df_b_new_np_zz, len_zz = bg_processor("ZZTo2L2Q_vau_bg", 2) +df_b_new_np_wz, len_wz = bg_processor("WZTo2Q2L_vau_bg", 3) +df_b_new_np_tt, len_tt = bg_processor("TTTo2L2Nu_vau_bg", 4) +df_b_new_np_zhtobb, len_zhtobb = bg_processor("ZH_HToBB_ZLL_vau_bg_old", 5) +max_len_bg = 0 +for l in [len_dy, len_zz, len_wz, len_tt, len_zhtobb]: + if max_len_bg < l: + max_len_bg = l + +###################################################################################### +##### Read np arrays of data sample ################################################## +###################################################################################### +data_path = 'condor_back_08_early/' +datas = ["Run2017B_DoubleMu_vau", "Run2017C_DoubleMu_vau", 
"Run2017D_DoubleMu_vau", "Run2017E_DoubleMu_vau", "Run2017F_DoubleMu_vau", + "Run2017B_DoubleEG_vau", "Run2017C_DoubleEG_vau", "Run2017D_DoubleEG_vau", "Run2017E_DoubleEG_vau", "Run2017F_DoubleEG_vau"] #"Run2017C_DoubleMu_vau" +df_data = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig_data for rois in roiis]) +for data in datas: + paths_np_data = [str(x) for x in Path(data_path + data).glob("**/*.npy") if ("_full" in str(x))] + + print(len(paths_np_data)) + df_data_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig_data for rois in roiis]) + print(df_data_full_np) + + key_np_data = {} + for col in names_sig_data: + for rois in roiis: + key_np_data[f'{col}_{rois}'] = [] + for col in names_sig_data: + for rois in roiis: + for path in paths_np_data: + if f'{col}_{rois}' in path: + key_np_data[f'{col}_{rois}'].append(path) + #print(key_np_back) + for key in key_np_data.keys(): + print(len(key_np_data[key]) == len(set(key_np_data[key]))) + key_np_data[key] = [np.load(element) for element in key_np_data[key]] + print(key) + + #print(key_np_back) + + max_length_data = 0 + key_np_full_data = {} + for col in names_sig_data: + for rois in roiis: + key_np_full_data[f'{col}_{rois}'] = np.array([]) + for key in key_np_full_data.keys(): + key_np_full_data[key] = np.concatenate(tuple(key_np_data[key]), axis = None) + print(len(key_np_full_data[key])) + if max_length_data < len(key_np_full_data[key]): + max_length_data = len(key_np_full_data[key]) + #print(key_np_full_back) + + for key in key_np_full_data.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_data_full_np[key] = list(np.append(key_np_full_data[key], np.repeat(np.nan, max_length_data- (len(key_np_full_data[key]))))) + #df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) + print(df_data_full_np) + df_dat_full_np = df_data_full_np[[f'{col}_{roi}' for col in names_sig_data]] + df_dat_new_np = df_dat_full_np.dropna() + print(df_dat_new_np) + + len_var = [] + for col in names_sig_data: + len_var.append(len(df_dat_new_np[f'{col}_{roi}'])) + df_dat_new_np['target'] = np.full(np.max(len_var), 2, dtype = int) + print(df_dat_new_np) + df_data = pd.concat([df_data, df_dat_new_np], ignore_index = True) +df_data.to_csv(f'./plot/{folder_save}/numpy_data_DATA.csv', sep=',', encoding='utf-8', index=False) +###################################################################################### +###################################################################################### +#folder_save = 'eval_23_07_25_2' +df = pd.concat([df_s_new_np, df_b_new_np_dy], ignore_index = True) +df = pd.concat([df, df_b_new_np_zz], ignore_index = True) +df = pd.concat([df, df_b_new_np_wz], ignore_index = True) +df = pd.concat([df, df_b_new_np_tt], ignore_index = True) +df = pd.concat([df, df_b_new_np_zhtobb], ignore_index = True) +print(df) +print(df.info()) +df.to_csv(net_path + f'/xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8', index=False) +df.to_csv(f'./plot/{folder_save}/xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8', index=False) + +xsec_weights = [0.002342, 6077., 3.74, 6.419, 88.51, 0.00720] + +#xsec_weights = [1 , 1, 1, 1, 1, 1] + +#df = pd.read_csv(f'xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8') + +print("% of negative weights: " + str(len(df[f"wei_{roi}"][df[f"wei_{roi}"]<0])/len(df[f"wei_{roi}"]))) + +time = arrow.now().format("YY_MM_DD") 
+
+plt.style.use(hep.style.ROOT)
+names_sig_updated = ['m(H)', '$p_t$(H)', '$p_t$(Z)', 'm(Z)', '$p_t$($Z_{gen}$)', 'm($Z_{gen}$)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$',
+          '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet',
+          '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$',
+          '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$']
+
+names_sig_updated_data = ['m(H)', '$p_t$(H)', '$p_t$(Z)', 'm(Z)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$',
+          '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet',
+          '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$',
+          '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$']
+
+
+c = 0
+for col in names_sig_data[1:]:
+
+    plt.figure(figsize=(10,10))
+    len_sig = int((df['target'] == 1).sum())  # number of signal rows, counted vectorised
+    print(len_sig)
+    names_big_ax = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'pt_lead', 'pt_sublead']
+    if col in names_big_ax:
+        hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_{roi}'][:len_sig])).plot()
+        hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_{roi}'][len_sig:])).plot()
+    else:
+        hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_{roi}'][:len_sig])).plot()
+        hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_{roi}'][len_sig:])).plot()
+    if 'pt' in col:
+        if 'ratio' not in col:
+            plt.xlabel('$p_t$ in GeV')
+        else:
+            plt.xlabel('')
+    elif 'mass' in col:
+        plt.xlabel('Mass in GeV')
+    else:
+        plt.xlabel('')
+    plt.ylabel("Counts")
+    # the loop runs over names_sig_data, so the matching label list is names_sig_updated_data
+    plt.title(f'{names_sig_updated_data[c]}_{roi}')
+    plt.legend(['Signal', 'Background'])
+    #plt.show()
+    plt.savefig(f"./plot/{folder_save}/{col}_{roi}.jpg")
+
+
+
+    fig, ((ax), (rax)) = plt.subplots(
+        2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True
+    )
+    fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97)
+    hep.cms.label("Private work", com="13", data=True, loc=0, ax=ax)
+    data_counts, data_bins = np.histogram(np.array(df_data[f'{col}_{roi}']),bins = 50, weights = np.array(df_data[f'wei_{roi}']))
+    counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True)
+    counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins = 80, weights = np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]), density = True)
+
+    counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80)
+    counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins = 80)
+    ## plot reference
+    n_bins = 80
+    xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig]))
+    bins = np.array([minval + i*((maxval-minval)/50) for i in range(0, 51)])
+
+    hep.histplot(
+        #np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 25, weights = 100000*xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])),
+        #label= 'ZH -> cc signal $\cdot 10^5$',
+        np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 50, weights = xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])),
+        label= 'ZH -> cc signal',
+        histtype="step",
+        color='r',
+        #yerr= 
np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 25, weights = 100000*xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig]))[0]), + yerr = True, + ax=ax, + density = False, + ) + #for i in range(0, len(bins2)-1): + # x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + # y_pos_sig = counts1[i] + (counts1[i] * 0.01) + # label_p_sig = str(counts11[i]) + # x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + # y_pos = counts2[i] + (counts2[i] * 0.01) + # label_p = str(counts22[i]) + # if i%5 == 0: + # ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + # if i%6 == 0: + # ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + n_bins = 80 + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + bins = np.array([minval + i*((maxval-minval)/50) for i in range(0, 51)]) + print(bins) + hep.histplot( + [np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]))], + stack = True, + label=['DY bg', 'ZZ bg', 'WZ bg', 'tt bg', 'ZH -> bb bg'], + histtype="fill", + color=['g', 'y', 'b', 'm', 'c'], + #bins = np.arange(80), + yerr = True, + #yerr= [np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]))[0]), + # np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]))[0])], + ax=ax, + density = False, + alpha = [0.3, 0.3, 0.3, 0.3, 0.3], + edgecolor = ["k", "k", "k", "k", "k"], + + ) + + ## plot compare list + ax.errorbar( + (data_bins[:-1] + data_bins[1:])/2, + np.array(data_counts), + label='Data', + marker = 'o', + color='k', + yerr=np.sqrt(np.array(data_counts)), #*(1/np.sum(data_counts)) + 
linestyle = "None",
+    )
+
+    # plot ratio of the signal density to each background density
+    nbinning = 50
+    counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True)
+    counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]), density = True)
+    counts3, bins3 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), density = True)
+    counts4, bins4 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), density = True)
+    counts5, bins5 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), density = True)
+    counts6, bins6 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), density = True)
+
+    # np.divide with `where` needs an explicit `out` array; without it the
+    # entries of empty background bins are left uninitialised
+    ratio_dy = np.divide(counts1, counts2, out = np.zeros_like(counts1), where = (counts2 != 0))
+    ratio_zz = np.divide(counts1, counts3, out = np.zeros_like(counts1), where = (counts3 != 0))
+    ratio_wz = np.divide(counts1, counts4, out = np.zeros_like(counts1), where = (counts4 != 0))
+    ratio_tt = np.divide(counts1, counts5, out = np.zeros_like(counts1), where = (counts5 != 0))
+    ratio_zhtobb = np.divide(counts1, counts6, out = np.zeros_like(counts1), where = (counts6 != 0))
+    rax.plot(bins1[:-1], ratio_dy, 'go')
+    rax.plot(bins1[:-1], ratio_zz, 'yo')
+    rax.plot(bins1[:-1], ratio_wz, 'bo')
+    rax.plot(bins1[:-1], ratio_tt, 'mo')
+    rax.plot(bins1[:-1], ratio_zhtobb, 'co')
+    rax.plot(bins1[:-1], [1]*len(ratio_dy), '--', color = 'black')
+
+
+    ## plot settings, adjust range
+    rax.set_xlabel(f'{names_sig_updated_data[c]} {roi}')
+    ax.set_xlabel(None)
+    ax.set_ylabel("Events")
+    rax.set_ylabel('$\\frac{Signal}{Background}$')
+    ax.ticklabel_format(style="sci", scilimits=(-3, 3))
+    ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05))
+    ax.legend()
+    rax.set_ylim(0.0, 4.0)
+    xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig]))
+    rax.set_xlim(minval, maxval)
+    at = AnchoredText(
+        "",
+        loc=2,
+        frameon=False,
+    )
+    ax.add_artist(at)
+    #hep.mpl_magic(ax=ax)
+    ax.set_ylim(bottom=0)
+
+    logext = ""
+    '''
+    # log y axis
+    if "log" in config.keys() and config["log"]:
+        ax.set_yscale("log")
+        logext = "_log"
+        ax.set_ylim(bottom=0.1)
+        hep.mpl_magic(ax=ax)
+    if "norm" in config.keys() and config["norm"]:
+        logext = "_norm" + logext
+    '''
+    fig.savefig(f"./plot/{folder_save}/compare_{col}_{roi}.pdf")
+    fig.savefig(f"./plot/{folder_save}/compare_{col}_{roi}.jpg")
+
+    ######################################################################################################
+    #### No rescaling ####################################################################################
+    ######################################################################################################
+    fig, ((ax), (rax)) = plt.subplots(
+        2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True
+    )
+    fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97)
+    hep.cms.label("Private work", com="13", data=True, loc=0, ax=ax)
+    counts1, bins1 = 
np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig])) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:])) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig])), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%5 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%6 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:])), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/No_dense/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/No_dense/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### No rescaling hist density ###################################################################### + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = 
np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, density = True) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, density = True) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%5 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%6 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/Np_dense/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Np_dense/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### No rescaling hist density True ################################################################# + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = 
np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, density = True) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, density = True) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + density = True, + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%5 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%6 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + density = True, + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/Np_dense_True/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Np_dense_True/compare_{col}_{roi}.jpg") + + c += 1 + +X = df.drop("target", axis = 1) +X = X.drop("target_bg", axis = 1) +print(X) +X = X.drop(f"wei_{roi}", axis = 1) +X = X.drop(f"Z_mass_{roi}", axis = 1) +X = X.drop(f"Z_pt_gen_{roi}", axis = 1) +X = X.drop(f"Z_mass_gen_{roi}", axis = 1) +print(X) +print(X.info()) + +y = df["target"] +print(y) + + + + +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder + +categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + +from sklearn.preprocessing import StandardScaler +numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = 
"mean")), ("scale", StandardScaler())]) + +cat_cols = X.select_dtypes(exclude = "number").columns +num_cols = X.select_dtypes(include = "number").columns + +print(cat_cols) +print(num_cols) + +from sklearn.compose import ColumnTransformer + +full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + +import xgboost as xgb + +X_processed = full_processor.fit_transform(X) +y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state = 1121218) + +from sklearn.metrics import accuracy_score + +### Init classifier +xgb_cl = xgb.XGBClassifier(booster = 'gbtree', base_score = 0.5, learning_rate = 0.01, gamma = 1, reg_alpha = 0.2, reg_lambda = 0.2, n_estimators = 1000, max_depth = 3, subsample = 0.8) + +### Fit +xgb_cl.fit(X_train, y_train) + +print(xgb_cl) +### Predict +preds = xgb_cl.predict(X_test) + +print(accuracy_score(y_test, preds)) + +from xgboost import plot_importance +from xgboost import plot_tree + +importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_}) +importances = importances.sort_values(by = "Importance", ascending = False) +importances = importances.set_index('Feature') +print(importances) +importances.plot.bar() + +fig, ax = plt.subplots(figsize=(17,12)) +plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax) +plt.xlabel('Feature scores') +plt.ylabel("Feature names") +plt.title('Importance plot') +plt.legend(['']) +#plt.show() +plt.savefig(f"./plot/{folder_save}/importance.jpg") + +plt.figure(figsize=(17,12)) +plot_tree(xgb_cl, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"./plot/{folder_save}/boost_tree.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show() diff --git a/xgb_test_no_coffea_diff_bgs_DATA_scale.py b/xgb_test_no_coffea_diff_bgs_DATA_scale.py new file mode 100644 index 0000000..847ec1d --- /dev/null +++ b/xgb_test_no_coffea_diff_bgs_DATA_scale.py @@ -0,0 +1,808 @@ +from coffea.util import load +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt, mplhep as hep +import hist +import argparse, sys, os, arrow, glob, yaml +from matplotlib.offsetbox import AnchoredText +from pathlib import Path +import os +from BTVNanoCommissioning.utils.plot_utils import ( + plotratio, + +) +net_path = "/net/scratch_cms3a/vaulin/" +folder_save = 'eval_23_09_14' +if not os.path.exists(f"./plot/{folder_save}"): + os.mkdir(f"./plot/{folder_save}") +if not os.path.exists(f"./plot/{folder_save}/No_dense"): + os.mkdir(f"./plot/{folder_save}/No_dense") +if not os.path.exists(f"./plot/{folder_save}/Np_dense"): + os.mkdir(f"./plot/{folder_save}/Np_dense") +if not os.path.exists(f"./plot/{folder_save}/Np_dense_True"): + os.mkdir(f"./plot/{folder_save}/Np_dense_True") +if not os.path.exists(net_path + f"plot/{folder_save}"): + os.mkdir(net_path + f"plot/{folder_save}") +def autoranger(array): + val, axis = array, np.arange(0,len(array)+1) + for i in range(len(val)): + if val[i] != 0: + mins = i + break + for i in reversed(range(len(val))): + if val[i] != 0: + maxs = i + 1 + break + print(axis[mins], axis[maxs]) + return axis[mins], axis[maxs], np.max(val), np.min(val) +names_sig = ['wei', 'Higgs_mass', 'Higgs_pt', 'Z_pt', 'Z_mass', 
'Z_pt_gen', 'Z_mass_gen', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +names_sig_data = ['wei', 'Higgs_mass', 'Higgs_pt', 'Z_pt', 'Z_mass', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +roiis = ['high_mumu', 'high_ee', 'low_mumu', 'low_ee'] +roi = 'low_mumu' +#roi = 'low_ee' + +###################################################################################### +##### Read np arrays of signal sample ################################################ +###################################################################################### +data_path = 'condor_signal_09_late/' +paths_np = [str(x) for x in Path(data_path + "ZHToCC_vau_sig").glob("**/*.npy") if ("_full" in str(x))] +#print(paths_np) +print(len(paths_np)) +df_sig_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_sig_full_np) + +key_np = {} +for col in names_sig: + for rois in roiis: + key_np[f'{col}_{rois}'] = [] +for col in names_sig: + for rois in roiis: + for path in paths_np: + if f'{col}_{rois}' in path: + key_np[f'{col}_{rois}'].append(path) + +for key in key_np.keys(): + #print(len(key_np[key]) == len(set(key_np[key]))) + key_np[key] = [np.load(element) for element in key_np[key]] + #print(key) + +print(key_np) + +key_np_full = {} +max_length = 0 +for col in names_sig: + for rois in roiis: + key_np_full[f'{col}_{rois}'] = np.array([]) +print(key_np_full) +for key in key_np_full.keys(): + key_np_full[key] = np.concatenate(tuple(key_np[key]), axis = None) + print(len(key_np_full[key])) + if max_length < len(key_np_full[key]): + max_length = len(key_np_full[key]) + +for key in key_np_full.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_sig_full_np[key] = list(np.append(key_np_full[key], np.repeat(np.nan, max_length- (len(key_np_full[key]))))) +#df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_sig_full_np) + + +for r in roiis: + df_s_new_np = df_sig_full_np[[f'{col}_{r}' for col in names_sig]] + + + df_s_new_np = df_s_new_np.dropna() + print(df_s_new_np) + len_var = [] + for col in names_sig: + len_var.append(len(df_s_new_np[f'{col}_{r}'])) + df_s_new_np['target'] = np.ones(np.max(len_var)) + df_s_new_np['target_bg'] = np.zeros(np.max(len_var)) + df_s_new_np.to_csv(f'./plot/{folder_save}/numpy_data_signal_{r}.csv', sep=',', encoding='utf-8', index=False) +print(df_s_new_np) +df_s_new_np = pd.read_csv(f'./plot/{folder_save}/numpy_data_signal_{roi}.csv', sep=',', encoding='utf-8') +print(len(df_s_new_np[f"wei_{roi}"])) +our_aray_results = len(df_s_new_np[f"wei_{roi}"]) +'''len_var = [] +for col in names_sig: + len_var.append(len(df_s_new_np[f'{col}_{roi}'])) + df_s_new_np['target'] = np.ones(np.max(len_var)) + df_s_new_np['target_bg'] = np.zeros(np.max(len_var)) +print(df_s_new_np) + + +df_s_new_np.to_csv(f'./plot/{folder_save}/numpy_data_signal_{roi}.csv', sep=',', encoding='utf-8', index=False) +'''#df_s_new_np = pd.read_csv(f'./plot/{folder_save}/numpy_data.csv', sep=',', encoding='utf-8') +###################################################################################### + + 
+###################################################################################### +##### Read np arrays of background sample ############################################ +###################################################################################### +data_path = 'condor_back_09_late/' +def bg_processor(bg, nr): + paths_np_back = [str(x) for x in Path(data_path + f"{bg}").glob("**/*.npy") if ("_full" in str(x))] + #paths_np_back = [str(x) for x in Path("./condor_back_04_mid/DYJetsToLL_nlo_vau_bg").glob("**/*.npy") if ("_full" in str(x))] + #print(paths_np_back) + print(len(paths_np_back)) + df_back_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) + print(df_back_full_np) + + key_np_back = {} + for col in names_sig: + for rois in roiis: + key_np_back[f'{col}_{rois}'] = [] + for col in names_sig: + for rois in roiis: + for path in paths_np_back: + if f'{col}_{rois}' in path: + key_np_back[f'{col}_{rois}'].append(path) + #print(key_np_back) + for key in key_np_back.keys(): + print(len(key_np_back[key]) == len(set(key_np_back[key]))) + key_np_back[key] = [np.load(element, allow_pickle = True) for element in key_np_back[key]] + print(key) + + #print(key_np_back) + + max_length_back = 0 + key_np_full_back = {} + for col in names_sig: + for rois in roiis: + key_np_full_back[f'{col}_{rois}'] = np.array([]) + for key in key_np_full_back.keys(): + key_np_full_back[key] = np.concatenate(tuple(key_np_back[key]), axis = None) + print(len(key_np_full_back[key])) + if max_length_back < len(key_np_full_back[key]): + max_length_back = len(key_np_full_back[key]) + #print(key_np_full_back) + + for key in key_np_full_back.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_back_full_np[key] = list(np.append(key_np_full_back[key], np.repeat(np.nan, max_length_back- (len(key_np_full_back[key]))))) + #df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) + print(df_back_full_np) + + + for r in roiis: + df_b_full_np = df_back_full_np[[f'{col}_{r}' for col in names_sig]] + df_b_new_np = df_b_full_np.dropna() + print(df_b_new_np) + len_var = [] + for col in names_sig: + len_var.append(len(df_b_new_np[f'{col}_{r}'])) + df_b_new_np['target'] = np.zeros(np.max(len_var)) + df_b_new_np['target_bg'] = np.array([nr]*np.max(len_var)) + print(df_b_new_np) + df_b_new_np.to_csv(f'./plot/{folder_save}/numpy_data_bg_{bg}_{r}.csv', sep=',', encoding='utf-8', index=False) + df_b_new_np = pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_{bg}_{roi}.csv', sep=',', encoding='utf-8') + return df_b_new_np, len(df_b_new_np['target']) + +df_b_new_np_dy, len_dy = bg_processor("DYJetsToLL_nlo_vau_bg", 1) +df_b_new_np_zz, len_zz = bg_processor("ZZTo2L2Q_vau_bg", 2) +df_b_new_np_wz, len_wz = bg_processor("WZTo2Q2L_vau_bg", 3) +df_b_new_np_tt, len_tt = bg_processor("TTTo2L2Nu_vau_bg", 4) +df_b_new_np_zhtobb, len_zhtobb = bg_processor("ZH_HToBB_ZLL_vau_bg", 5) +max_len_bg = 0 +for l in [len_dy, len_zz, len_wz, len_tt, len_zhtobb]: + if max_len_bg < l: + max_len_bg = l + +###################################################################################### +##### Read np arrays of data sample ################################################## +###################################################################################### +data_path = 'condor_back_09_late/' +datas = ["Run2017B_DoubleMu_vau", "Run2017C_DoubleMu_vau", "Run2017D_DoubleMu_vau", 
"Run2017E_DoubleMu_vau", "Run2017F_DoubleMu_vau", + "Run2017B_DoubleEG_vau", "Run2017C_DoubleEG_vau", "Run2017D_DoubleEG_vau", "Run2017E_DoubleEG_vau", "Run2017F_DoubleEG_vau"] #"Run2017C_DoubleMu_vau" +df_data = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig_data for rois in roiis]) +for data in datas: + paths_np_data = [str(x) for x in Path(data_path + data).glob("**/*.npy") if ("_full" in str(x))] + + print(len(paths_np_data)) + df_data_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig_data for rois in roiis]) + print(df_data_full_np) + + key_np_data = {} + for col in names_sig_data: + for rois in roiis: + key_np_data[f'{col}_{rois}'] = [] + for col in names_sig_data: + for rois in roiis: + for path in paths_np_data: + if f'{col}_{rois}' in path: + key_np_data[f'{col}_{rois}'].append(path) + #print(key_np_back) + for key in key_np_data.keys(): + print(len(key_np_data[key]) == len(set(key_np_data[key]))) + key_np_data[key] = [np.load(element) for element in key_np_data[key]] + print(key) + + #print(key_np_back) + + max_length_data = 0 + key_np_full_data = {} + for col in names_sig_data: + for rois in roiis: + key_np_full_data[f'{col}_{rois}'] = np.array([]) + for key in key_np_full_data.keys(): + key_np_full_data[key] = np.concatenate(tuple(key_np_data[key]), axis = None) + print(len(key_np_full_data[key])) + if max_length_data < len(key_np_full_data[key]): + max_length_data = len(key_np_full_data[key]) + #print(key_np_full_back) + + for key in key_np_full_data.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_data_full_np[key] = list(np.append(key_np_full_data[key], np.repeat(np.nan, max_length_data- (len(key_np_full_data[key]))))) + #df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) + print(df_data_full_np) + + for r in roiis: + df_dat_full_np = df_data_full_np[[f'{col}_{r}' for col in names_sig_data]] + df_dat_new_np = df_dat_full_np.dropna() + print(df_dat_new_np) + len_var = [] + for col in names_sig_data: + len_var.append(len(df_dat_new_np[f'{col}_{r}'])) + df_dat_new_np['target'] = np.full(np.max(len_var), 2, dtype = int) + print(df_dat_new_np) + df_data = pd.concat([df_data, df_dat_new_np], ignore_index = True) + df_data.to_csv(f'./plot/{folder_save}/numpy_data_DATA_{r}.csv', sep=',', encoding='utf-8', index=False) +df_data = pd.read_csv(f'./plot/{folder_save}/numpy_data_DATA_{roi}.csv', sep=',', encoding='utf-8') +###################################################################################### +###################################################################################### +#folder_save = 'eval_23_07_25_2' +df = pd.concat([df_s_new_np, df_b_new_np_dy], ignore_index = True) +df = pd.concat([df, df_b_new_np_zz], ignore_index = True) +df = pd.concat([df, df_b_new_np_wz], ignore_index = True) +df = pd.concat([df, df_b_new_np_tt], ignore_index = True) +df = pd.concat([df, df_b_new_np_zhtobb], ignore_index = True) +print(df) +print(df.info()) +df.to_csv(net_path + f'/xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8', index=False) +df.to_csv(f'./plot/{folder_save}/xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8', index=False) + +xsec_weights = [0.002342*(41480/3323082), 6077.*(41480/102863931), 3.74*(41480/19134840), + 6.419*(41480/18136498), 88.51*(41480/105859990), 0.00720*(41480/4337504)] + +#xsec_weights = [1 , 1, 1, 1, 1, 1] + +#df = pd.read_csv(f'xgb_training_dataset_{roi}.csv', 
sep=',', encoding='utf-8')
+
+print("% of negative weights: " + str(len(df[f"wei_{roi}"][df[f"wei_{roi}"]<0])/len(df[f"wei_{roi}"])))
+
+time = arrow.now().format("YY_MM_DD")
+plt.style.use(hep.style.ROOT)
+names_sig_updated = ['m(H)', '$p_t$(H)', '$p_t$(Z)', 'm(Z)', '$p_t$($Z_{gen}$)', 'm($Z_{gen}$)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$',
+                    '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet',
+                    '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$',
+                    '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$']
+
+names_sig_updated_data = ['m(H)', '$p_t$(H)', '$p_t$(Z)', 'm(Z)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$',
+                    '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet',
+                    '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$',
+                    '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$']
+
+
+c = 0
+for col in names_sig_data[1:]:
+
+    plt.figure(figsize=(10,10))
+    len_sig = 0
+    for i in range(0,len(df['target'])):
+        if df['target'][i] == 1:
+            len_sig += 1
+    print(len_sig)
+    names_big_ax = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'pt_lead', 'pt_sublead']
+    if col in names_big_ax:
+        hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_{roi}'][:len_sig])).plot()
+        hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_{roi}'][len_sig:])).plot()
+    else:
+        hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_{roi}'][:len_sig])).plot()
+        hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_{roi}'][len_sig:])).plot()
+    if 'pt' in col:
+        if 'ratio' not in col:
+            plt.xlabel('$p_t$ in GeV')
+        else:
+            plt.xlabel('')
+    elif 'mass' in col:
+        plt.xlabel('Mass in GeV')
+    else:
+        plt.xlabel('')
+    plt.ylabel("Counts")
+    plt.title(f'{names_sig_updated_data[c]}_{roi}')
+    plt.legend(['Signal', 'Background'])
+    #plt.show()
+    plt.savefig(f"./plot/{folder_save}/{col}_{roi}.jpg")
+
+
+
+    fig, ((ax), (rax)) = plt.subplots(
+        2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True
+    )
+    fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97)
+    hep.cms.label("Private work", com="13", data=True, loc=0, ax=ax)
+    data_counts, data_bins = np.histogram(np.array(df_data[f'{col}_{roi}']),bins =50, weights = np.array(df_data[f'wei_{roi}']))
+    counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True)
+    counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]), density = True)
+
+    counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80)
+    counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins =80)
+    ## plot reference
+    n_bins = 80
+    xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig]))
+    bins = np.array([minval + i*((maxval-minval)/50) for i in range(0, 51)])
+
+    hep.histplot(
+        #np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 25, weights = 100000*xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])),
+        #label= 'ZH -> cc signal $\cdot 10^5$',
+        
np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 50, weights = xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])), + label= 'ZH -> cc signal', + histtype="step", + color='r', + #yerr= np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 25, weights = 100000*xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig]))[0]), + yerr = True, + ax=ax, + density = False, + ) + #for i in range(0, len(bins2)-1): + # x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + # y_pos_sig = counts1[i] + (counts1[i] * 0.01) + # label_p_sig = str(counts11[i]) + # x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + # y_pos = counts2[i] + (counts2[i] * 0.01) + # label_p = str(counts22[i]) + # if i%5 == 0: + # ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + # if i%6 == 0: + # ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + n_bins = 80 + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + bins = np.array([minval + i*((maxval-minval)/50) for i in range(0, 51)]) + print(bins) + hep.histplot( + [np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]))], + stack = True, + label=['DY bg', 'ZZ bg', 'WZ bg', 'tt bg', 'ZH -> bb bg'], + histtype="fill", + color=['g', 'y', 'b', 'm', 'c'], + #bins = np.arange(80), + yerr = True, + #yerr= [np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]))[0]), + # np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]))[0])], + ax=ax, + density = False, + alpha = [0.3, 0.3, 0.3, 0.3, 0.3], + edgecolor = ["k", "k", "k", "k", "k"], + + ) + + ## plot compare 
list + ax.errorbar( + (data_bins[:-1] + data_bins[1:])/2, + np.array(data_counts), + label='Data', + marker = 'o', + color='k', + yerr=np.sqrt(np.array(data_counts)), #*(1/np.sum(data_counts)) + linestyle = "None", + ) + + # plot ratio of com/Ref + nbinning = 50 + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]), density = True) + counts3, bins3 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), density = True) + counts4, bins4 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), density = True) + counts5, bins5 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), density = True) + counts6, bins6 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), density = True) + + ratio_dy = np.divide(counts1, counts2, where = (counts2 != 0)) + ratio_zz = np.divide(counts1, counts3, where = (counts3 != 0)) + ratio_wz = np.divide(counts1, counts4, where = (counts4 != 0)) + ratio_tt = np.divide(counts1, counts5, where = (counts5 != 0)) + ratio_zhtobb = np.divide(counts1, counts6, where = (counts6 != 0)) + rax.plot(bins1[:-1], ratio_dy, 'go') + rax.plot(bins1[:-1], ratio_zz, 'yo') + rax.plot(bins1[:-1], ratio_wz, 'bo') + rax.plot(bins1[:-1], ratio_tt, 'mo') + rax.plot(bins1[:-1], ratio_zhtobb, 'co') + rax.plot(bins1[:-1], [1]*len(ratio_dy), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated_data[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 4.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + #hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### No rescaling #################################################################################### + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + 
fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig])) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:])) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig])), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%5 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%6 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:])), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/No_dense/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/No_dense/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### No rescaling hist density ###################################################################### + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, 
bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, density = True) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, density = True) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%5 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%6 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/Np_dense/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Np_dense/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### No rescaling hist density True ################################################################# + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = 
np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, density = True) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, density = True) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + density = True, + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%5 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%6 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + density = True, + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/Np_dense_True/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Np_dense_True/compare_{col}_{roi}.jpg") + + c += 1 + +X = df.drop("target", axis = 1) +X = X.drop("target_bg", axis = 1) +print(X) +X = X.drop(f"wei_{roi}", axis = 1) +X = X.drop(f"Z_mass_{roi}", axis = 1) +X = X.drop(f"Z_pt_gen_{roi}", axis = 1) +X = X.drop(f"Z_mass_gen_{roi}", axis = 1) +print(X) +print(X.info()) + +y = df["target"] +print(y) + + + + +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder + +categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", 
sparse = False)),])
+
+from sklearn.preprocessing import StandardScaler
+numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())])
+
+cat_cols = X.select_dtypes(exclude = "number").columns
+num_cols = X.select_dtypes(include = "number").columns
+
+print(cat_cols)
+print(num_cols)
+
+from sklearn.compose import ColumnTransformer
+
+full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),])
+
+import xgboost as xgb
+
+X_processed = full_processor.fit_transform(X)
+y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1))
+
+from sklearn.model_selection import train_test_split
+
+X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state = 1121218)
+
+from sklearn.metrics import accuracy_score
+
+### Init classifier
+xgb_cl = xgb.XGBClassifier(booster = 'gbtree', base_score = 0.5, learning_rate = 0.01, gamma = 1, reg_alpha = 0.2, reg_lambda = 0.2, n_estimators = 1000, max_depth = 3, subsample = 0.8)
+
+### Fit
+xgb_cl.fit(X_train, y_train)
+
+print(xgb_cl)
+### Predict
+preds = xgb_cl.predict(X_test)
+
+print(accuracy_score(y_test, preds))
+
+from xgboost import plot_importance
+from xgboost import plot_tree
+
+importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_})
+importances = importances.sort_values(by = "Importance", ascending = False)
+importances = importances.set_index('Feature')
+print(importances)
+importances.plot.bar()
+
+fig, ax = plt.subplots(figsize=(17,12))
+plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax)
+plt.xlabel('Feature scores')
+plt.ylabel("Feature names")
+plt.title('Importance plot')
+plt.legend([''])
+#plt.show()
+plt.savefig(f"./plot/{folder_save}/importance.jpg")
+
+plt.figure(figsize=(17,12))
+plot_tree(xgb_cl, fmap = 'feature_map.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(f"./plot/{folder_save}/boost_tree.jpg", dpi = 1800)
+### result = 1/(1+np.exp(-leaf_value)) for belonging to class 1
+#plt.show()
diff --git a/xgb_test_no_coffea_diff_bgs_DATA_scale_pandas.py b/xgb_test_no_coffea_diff_bgs_DATA_scale_pandas.py
new file mode 100644
index 0000000..7218d1d
--- /dev/null
+++ b/xgb_test_no_coffea_diff_bgs_DATA_scale_pandas.py
@@ -0,0 +1,1239 @@
+from coffea.util import load
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt, mplhep as hep
+import hist
+import argparse, sys, os, arrow, glob, yaml
+from matplotlib.offsetbox import AnchoredText
+from pathlib import Path
+import os
+from BTVNanoCommissioning.utils.plot_utils import (
+    plotratio,
+
+)
+net_path = "/net/scratch_cms3a/vaulin/"
+folder_save = 'eval_23_10_16'
+if not os.path.exists(f"./plot/{folder_save}"):
+    os.mkdir(f"./plot/{folder_save}")
+if not os.path.exists(f"./plot/{folder_save}/No_dense"):
+    os.mkdir(f"./plot/{folder_save}/No_dense")
+if not os.path.exists(f"./plot/{folder_save}/Np_dense"):
+    os.mkdir(f"./plot/{folder_save}/Np_dense")
+if not os.path.exists(f"./plot/{folder_save}/Np_dense_True"):
+    os.mkdir(f"./plot/{folder_save}/Np_dense_True")
+if not os.path.exists(net_path + f"plot/{folder_save}"):
+    os.mkdir(net_path + f"plot/{folder_save}")
+def autoranger(array):
+    val, axis = array, np.arange(0,len(array)+1)
+    for i in range(len(val)):
+        if val[i] != 0:
+            mins = i
+            break
+    for i in reversed(range(len(val))):
+        if val[i] != 0:
+            maxs = i
+ 1 + break + print(axis[mins], axis[maxs]) + return axis[mins], axis[maxs], np.max(val), np.min(val) +names_sig = ['wei', 'Higgs_mass', 'Higgs_pt', 'Z_pt', 'Z_mass', 'Z_pt_gen', 'Z_mass_gen', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +names_sig_data = ['wei', 'Higgs_mass', 'Higgs_pt', 'Z_pt', 'Z_mass', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +roiis = ['high_mumu', 'high_ee', 'low_mumu', 'low_ee'] +# ", high $p_t$ $Z \\rightarrow ee$" +# ", high $p_t$ $Z \\rightarrow \\mu\\mu$" +# ", low $p_t$ $Z \\rightarrow ee$" +# ", low $p_t$ $Z \\rightarrow \\mu\\mu$" +roi = 'low_mumu' +roi_latex = ", low $p_t$ $Z \\rightarrow \\mu\\mu$" +#roi = 'low_ee' + +###################################################################################### +##### Read np arrays of signal sample ################################################ +###################################################################################### +''' +data_path = 'condor_signal_06_mid/' +paths_np = [str(x) for x in Path(data_path + "ZHToCC_vau_sig").glob("**/*.npy") if ("_full" in str(x))] +#print(paths_np) +print(len(paths_np)) +df_sig_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_sig_full_np) + +key_np = {} +for col in names_sig: + for rois in roiis: + key_np[f'{col}_{rois}'] = [] +for col in names_sig: + for rois in roiis: + for path in paths_np: + if f'{col}_{rois}' in path: + key_np[f'{col}_{rois}'].append(path) + +for key in key_np.keys(): + #print(len(key_np[key]) == len(set(key_np[key]))) + key_np[key] = [np.load(element) for element in key_np[key]] + #print(key) + +print(key_np) + +key_np_full = {} +max_length = 0 +for col in names_sig: + for rois in roiis: + key_np_full[f'{col}_{rois}'] = np.array([]) +print(key_np_full) +for key in key_np_full.keys(): + key_np_full[key] = np.concatenate(tuple(key_np[key]), axis = None) + print(len(key_np_full[key])) + if max_length < len(key_np_full[key]): + max_length = len(key_np_full[key]) + +for key in key_np_full.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_sig_full_np[key] = list(np.append(key_np_full[key], np.repeat(np.nan, max_length- (len(key_np_full[key]))))) +#df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_sig_full_np) +df_s_new_np = df_sig_full_np[[f'{col}_{roi}' for col in names_sig]] + +print(len(df_s_new_np[f"wei_{roi}"])) +our_aray_results = len(df_s_new_np[f"wei_{roi}"]) + + + +df_s_new_np = df_s_new_np.dropna() +print(df_s_new_np) +len_var = [] +for col in names_sig: + len_var.append(len(df_s_new_np[f'{col}_{roi}'])) + df_s_new_np['target'] = np.ones(np.max(len_var)) + df_s_new_np['target_bg'] = np.zeros(np.max(len_var)) +print(df_s_new_np) + + +df_s_new_np.to_csv(f'./plot/{folder_save}/numpy_data_signal_{roi}.csv', sep=',', encoding='utf-8', index=False) +''' +df_s_new_np = pd.read_csv(f'./plot/{folder_save}/numpy_data_signal_{roi}.csv', sep=',', encoding='utf-8') +###################################################################################### + + 
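+# Sketch of the cache-or-rebuild pattern this file follows (build_signal_frame is
+# a hypothetical helper standing in for the commented-out block above; the cached
+# CSV must exist from an earlier run for the plain read_csv path to work):
+#
+#   cache = Path(f'./plot/{folder_save}/numpy_data_signal_{roi}.csv')
+#   if cache.exists():
+#       df_s_new_np = pd.read_csv(cache, sep=',', encoding='utf-8')
+#   else:
+#       df_s_new_np = build_signal_frame(data_path, names_sig, roiis, roi)
+#       df_s_new_np.to_csv(cache, sep=',', encoding='utf-8', index=False)
+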
+###################################################################################### +##### Read np arrays of background sample ############################################ +###################################################################################### +''' +data_path = 'condor_back_07_early/' +def bg_processor(bg, nr): + paths_np_back = [str(x) for x in Path(data_path + f"{bg}").glob("**/*.npy") if ("_full" in str(x))] + #paths_np_back = [str(x) for x in Path("./condor_back_04_mid/DYJetsToLL_nlo_vau_bg").glob("**/*.npy") if ("_full" in str(x))] + #print(paths_np_back) + print(len(paths_np_back)) + df_back_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) + print(df_back_full_np) + + key_np_back = {} + for col in names_sig: + for rois in roiis: + key_np_back[f'{col}_{rois}'] = [] + for col in names_sig: + for rois in roiis: + for path in paths_np_back: + if f'{col}_{rois}' in path: + key_np_back[f'{col}_{rois}'].append(path) + #print(key_np_back) + for key in key_np_back.keys(): + print(len(key_np_back[key]) == len(set(key_np_back[key]))) + key_np_back[key] = [np.load(element) for element in key_np_back[key]] + print(key) + + #print(key_np_back) + + max_length_back = 0 + key_np_full_back = {} + for col in names_sig: + for rois in roiis: + key_np_full_back[f'{col}_{rois}'] = np.array([]) + for key in key_np_full_back.keys(): + key_np_full_back[key] = np.concatenate(tuple(key_np_back[key]), axis = None) + print(len(key_np_full_back[key])) + if max_length_back < len(key_np_full_back[key]): + max_length_back = len(key_np_full_back[key]) + #print(key_np_full_back) + + for key in key_np_full_back.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_back_full_np[key] = list(np.append(key_np_full_back[key], np.repeat(np.nan, max_length_back- (len(key_np_full_back[key]))))) + #df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) + print(df_back_full_np) + df_b_full_np = df_back_full_np[[f'{col}_{roi}' for col in names_sig]] + df_b_new_np = df_b_full_np.dropna() + print(df_b_new_np) + + len_var = [] + for col in names_sig: + len_var.append(len(df_b_new_np[f'{col}_{roi}'])) + df_b_new_np['target'] = np.zeros(np.max(len_var)) + df_b_new_np['target_bg'] = np.array([nr]*np.max(len_var)) + print(df_b_new_np) + df_b_new_np.to_csv(f'./plot/{folder_save}/numpy_data_bg_{bg}_{roi}.csv', sep=',', encoding='utf-8', index=False) + return df_b_new_np, len(df_b_new_np['target']) + +''' +#df_b_new_np_dy, len_dy = bg_processor("DYJetsToLL_nlo_vau_bg", 1) +#df_b_new_np_zz, len_zz = bg_processor("ZZTo2L2Q_vau_bg", 2) +#df_b_new_np_wz, len_wz = bg_processor("WZTo2Q2L_vau_bg", 3) +#df_b_new_np_tt, len_tt = bg_processor("TTTo2L2Nu_vau_bg", 4) +#df_b_new_np_zhtobb, len_zhtobb = bg_processor("ZH_HToBB_ZLL_vau_bg_old", 5) + +df_b_new_np_dy, len_dy = pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_DYJetsToLL_nlo_vau_bg_{roi}.csv', sep=',', encoding='utf-8'), len(pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_DYJetsToLL_nlo_vau_bg_{roi}.csv', sep=',', encoding='utf-8')['target']) +df_b_new_np_zz, len_zz = pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_ZZTo2L2Q_vau_bg_{roi}.csv', sep=',', encoding='utf-8'), len(pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_ZZTo2L2Q_vau_bg_{roi}.csv', sep=',', encoding='utf-8')['target']) +df_b_new_np_wz, len_wz = pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_WZTo2Q2L_vau_bg_{roi}.csv', sep=',', 
encoding='utf-8'), len(pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_WZTo2Q2L_vau_bg_{roi}.csv', sep=',', encoding='utf-8')['target']) +df_b_new_np_tt, len_tt = pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_TTTo2L2Nu_vau_bg_{roi}.csv', sep=',', encoding='utf-8'), len(pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_TTTo2L2Nu_vau_bg_{roi}.csv', sep=',', encoding='utf-8')['target']) +df_b_new_np_zhtobb, len_zhtobb = pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_ZH_HToBB_ZLL_vau_bg_{roi}.csv', sep=',', encoding='utf-8'), len(pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_ZH_HToBB_ZLL_vau_bg_{roi}.csv', sep=',', encoding='utf-8')['target']) + +max_len_bg = 0 +for l in [len_dy, len_zz, len_wz, len_tt, len_zhtobb]: + if max_len_bg < l: + max_len_bg = l + +###################################################################################### +##### Read np arrays of data sample ################################################## +###################################################################################### + +data_path = 'condor_back_09_late/' +datas = ["Run2017F_DoubleMu_vau", "Run2017F_DoubleEG_vau", "Run2017D_DoubleEG_vau", "Run2017B_DoubleMu_vau", "Run2017C_DoubleMu_vau", "Run2017D_DoubleMu_vau", "Run2017E_DoubleMu_vau", + "Run2017B_DoubleEG_vau", "Run2017C_DoubleEG_vau", "Run2017E_DoubleEG_vau"] +''' +for data in datas: + + paths_np_data = [str(x) for x in Path(data_path + data).glob("**/*.npy") if ("_full" in str(x))] + + print(len(paths_np_data), data) + df_data_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig_data for rois in roiis]) + print(df_data_full_np) + + key_np_data = {} + for col in names_sig_data: + for rois in roiis: + key_np_data[f'{col}_{rois}'] = [] + for col in names_sig_data: + for rois in roiis: + for path in paths_np_data: + if f'{col}_{rois}' in path: + key_np_data[f'{col}_{rois}'].append(path) + #print(key_np_back) + for key in key_np_data.keys(): + print(len(key_np_data[key]) == len(set(key_np_data[key]))) + key_np_data[key] = [np.load(element, allow_pickle = True) for element in key_np_data[key]] + print(key) + + #print(key_np_back) + + max_length_data = 0 + key_np_full_data = {} + for col in names_sig_data: + for rois in roiis: + key_np_full_data[f'{col}_{rois}'] = np.array([]) + for key in key_np_full_data.keys(): + key_np_full_data[key] = np.concatenate(tuple(key_np_data[key]), axis = None) + print(len(key_np_full_data[key])) + if max_length_data < len(key_np_full_data[key]): + max_length_data = len(key_np_full_data[key]) + #print(key_np_full_back) + + for key in key_np_full_data.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_data_full_np[key] = list(np.append(key_np_full_data[key], np.repeat(np.nan, max_length_data- (len(key_np_full_data[key]))))) + #df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) + print(df_data_full_np) + for r in roiis: + df_data = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig_data for rois in roiis]) + df_dat_full_np = df_data_full_np[[f'{col}_{r}' for col in names_sig_data]] + df_dat_new_np = df_dat_full_np.dropna() + print(df_dat_new_np) + len_var = [] + for col in names_sig_data: + len_var.append(len(df_dat_new_np[f'{col}_{r}'])) + df_dat_new_np['target'] = np.full(np.max(len_var), 2, dtype = int) + print(df_dat_new_np) + df_data = pd.concat([df_data, df_dat_new_np], ignore_index = True) + 
df_data.to_csv(f'./plot/{folder_save}/numpy_data_DATA_{r}_{data}.csv', sep=',', encoding='utf-8', index=False) +''' +df_data_final = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig_data for rois in roiis]) +for data in datas: + df_data = pd.read_csv(f'./plot/{folder_save}/numpy_data_DATA_{roi}_{data}.csv', sep=',', encoding='utf-8') + df_data_final = pd.concat([df_data_final, df_data], ignore_index = True) +print(df_data_final) +df_data_final.to_csv(f'./plot/{folder_save}/numpy_data_DATA_final_{roi}.csv', sep=',', encoding='utf-8', index=False) +###################################################################################### +###################################################################################### +#folder_save = 'eval_23_07_25_2' +df = pd.concat([df_s_new_np, df_b_new_np_dy], ignore_index = True) +df = pd.concat([df, df_b_new_np_zz], ignore_index = True) +df = pd.concat([df, df_b_new_np_wz], ignore_index = True) +df = pd.concat([df, df_b_new_np_tt], ignore_index = True) +df = pd.concat([df, df_b_new_np_zhtobb], ignore_index = True) +print(df) +print(df.info()) +df.to_csv(net_path + f'/xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8', index=False) +df.to_csv(f'./plot/{folder_save}/xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8', index=False) + +strange_factor = 1.26 ###low_channel +#strange_factor = 1.22 ###high_channel +lumi = 41480*strange_factor +#lumi = 49810 +xsec_weights = [0.002342*(lumi/3323082), 6077.*(lumi/102863931), 3.74*(lumi/19134840), + 6.419*(lumi/18136498), 88.51*(lumi/105859990), 0.00720*(lumi/4337504)] +print(xsec_weights) +#xsec_weights = [1 , 1, 1, 1, 1, 1] + +#df = pd.read_csv(f'xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8') + +print("% of negative weights: " + str(len(df[f"wei_{roi}"][df[f"wei_{roi}"]<0])/len(df[f"wei_{roi}"]))) + +time = arrow.now().format("YY_MM_DD") +plt.style.use(hep.style.ROOT) +names_sig_updated = ['m(H)', '$p_t$(H)', '$p_t$(Z)', 'm(Z)', '$p_t$($Z_{gen}$)', 'm($Z_{gen}$)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$', + '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet', + '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$', + '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$'] + +names_sig_updated_data = ['m(H) [GeV]', '$p_t$(H) [GeV]', '$p_t$(Z) [GeV]', 'm(Z) [GeV]', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$', + '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet [GeV]', '$p_t$ of $CvsL_{min}$ jet [GeV]', + '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$', + '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$'] + +import scipy +c = 0 +for col in names_sig_data[1:]: + + plt.figure(figsize=(10,10)) + len_sig = 0 + for i in range(0,len(df['target'])): + if df['target'][i] == 1: + len_sig += 1 + print(len_sig) + names_big_ax = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'pt_lead', 'pt_sublead'] + if col in names_big_ax: + hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_{roi}'][:len_sig])).plot() + hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_{roi}'][len_sig:])).plot() + else: + hist.Hist.new.Regular(150, 0, 
5).Double().fill(np.array(df[f'{col}_{roi}'][:len_sig])).plot()
+        hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_{roi}'][len_sig:])).plot()
+    if 'pt' in col:
+        if 'ratio' not in col:
+            plt.xlabel('$p_t$ in GeV')
+        else:
+            plt.xlabel('')
+    elif 'mass' in col:
+        plt.xlabel('Mass in GeV')
+    else:
+        plt.xlabel('')
+    plt.ylabel("Counts")
+    plt.title(f'{names_sig_updated[c]}_{roi}')
+    plt.legend(['Signal', 'Background'])
+    #plt.show()
+    plt.savefig(f"./plot/{folder_save}/{col}_{roi}.jpg")
+
+
+
+    fig, ((ax), (rax)) = plt.subplots(
+        2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True
+    )
+    fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97)
+    hep.cms.label("Private work", com="13", data=True, loc=0, ax=ax)
+
+    counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]), bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True)
+    counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = 80, weights = np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]), density = True)
+    print(f'{col}_{roi}', len_sig, len_dy)
+    counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]), bins = 80)
+    counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = 80)
+    ## plot reference
+    n_bins = 80
+    xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig]))
+    bins = np.array([minval + i*((maxval-minval)/50) for i in range(0, 51)])
+    data_counts, data_bins = np.histogram(np.array(df_data_final[f'{col}_{roi}']), bins = bins, weights = np.array(df_data_final[f'wei_{roi}']))
+
+    hep.histplot(
+        #np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 25, weights = 100000*xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])),
+        #label= 'ZH -> cc signal $\cdot 10^5$',
+        np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]), bins = bins, weights = xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])),
+        label= 'ZH -> cc signal',
+        histtype="step",
+        color='r',
+        #yerr= np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 25, weights = 100000*xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig]))[0]),
+        yerr = True,
+        ax=ax,
+        density = False,
+    )
+    #for i in range(0, len(bins2)-1):
+    #    x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i]
+    #    y_pos_sig = counts1[i] + (counts1[i] * 0.01)
+    #    label_p_sig = str(counts11[i])
+    #    x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i]
+    #    y_pos = counts2[i] + (counts2[i] * 0.01)
+    #    label_p = str(counts22[i])
+    #    if i%5 == 0:
+    #        ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green')
+    #    if i%6 == 0:
+    #        ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red')
+    ## plot compare list
+    n_bins = 80
+    xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig]))
+    bins = np.array([minval + i*((maxval-minval)/50) for i in range(0, 51)])
+    print(bins)
+    hep.histplot(
+        [np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)])),
+        np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)])),
+        np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)])), + 
np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]))], + stack = True, + label=['DY bg', 'ZZ bg', 'WZ bg', 'tt bg', 'ZH -> bb bg'], + histtype="fill", + color=['g', 'y', 'b', 'm', 'c'], + #bins = np.arange(80), + yerr = True, + #yerr= [np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]))[0]), + # np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]))[0])], + ax=ax, + density = False, + alpha = [0.3, 0.3, 0.3, 0.3, 0.3], + edgecolor = ["k", "k", "k", "k", "k"], + + ) + + ## plot compare list + ax.errorbar( + (data_bins[:-1] + data_bins[1:])/2, + np.array(data_counts), + label='Data', + marker = 'o', + color='k', + yerr=np.sqrt(np.array(data_counts)), #*(1/np.sum(data_counts)) + linestyle = "None", + ) + # plot ratio of com/Ref + nbinning = 50 + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]), density = True) + counts3, bins3 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), density = True) + counts4, bins4 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), density = True) + counts5, bins5 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), density = True) + counts6, bins6 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), density = True) + + ratio_dy = np.divide(counts1, counts2, where = (counts2 != 0)) + ratio_zz = np.divide(counts1, counts3, where = (counts3 != 0)) + ratio_wz = 
np.divide(counts1, counts4, where = (counts4 != 0)) + ratio_tt = np.divide(counts1, counts5, where = (counts5 != 0)) + ratio_zhtobb = np.divide(counts1, counts6, where = (counts6 != 0)) + rax.plot(bins1[:-1], ratio_dy, 'go') + rax.plot(bins1[:-1], ratio_zz, 'yo') + rax.plot(bins1[:-1], ratio_wz, 'bo') + rax.plot(bins1[:-1], ratio_tt, 'mo') + rax.plot(bins1[:-1], ratio_zhtobb, 'co') + rax.plot(bins1[:-1], [1]*len(ratio_dy), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated_data[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 4.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + #hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### No rescaling #################################################################################### + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13", data=True, loc=0, ax=ax) + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]), density = True) + print(f'{col}_{roi}', len_sig, len_dy) + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins =80) + ## plot reference + n_bins = 50 + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + bins = np.array([minval + i*((maxval-minval)/50) for i in range(0, 51)]) + data_counts, data_bins = np.histogram(np.array(df_data_final[f'{col}_{roi}']),bins =bins, weights = np.array(df_data_final[f'wei_{roi}'])) + counts_sig, bins_sig = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = bins, weights = np.array(df[f'wei_{roi}'][:len_sig])) + hep.histplot( + #np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 25, weights = 100000*xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])), + #label= 'ZH -> cc signal $\cdot 10^5$', + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = bins, weights = 1000*xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])), + #label= 'ZH -> cc signal', + label= 'ZH -> c$\\bar{c}$ \n signal \n ($\cdot 10^3$)', + histtype="step", + color='r', + yerr= 1000*xsec_weights[0]*np.sqrt(counts_sig), + #yerr = True, + ax=ax, + density = False, + 
) + #for i in range(0, len(bins2)-1): + # x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + # y_pos_sig = counts1[i] + (counts1[i] * 0.01) + # label_p_sig = str(counts11[i]) + # x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + # y_pos = counts2[i] + (counts2[i] * 0.01) + # label_p = str(counts22[i]) + # if i%5 == 0: + # ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + # if i%6 == 0: + # ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + n_bins = 50 + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + bins = np.array([minval + i*((maxval-minval)/50) for i in range(0, 51)]) + print(bins) + hep.histplot( + [np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]))], + stack = True, + label=['DY bg', 'ZZ bg', 'WZ bg', 'tt bg', 'ZH -> bb bg'], + histtype="fill", + color=['g', 'y', 'b', 'm', 'c'], + #bins = np.arange(80), + yerr = True, + #yerr= [np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]))[0]), + # np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]))[0])], + ax=ax, + density = False, + alpha = [0.3, 0.3, 0.3, 0.3, 0.3], + edgecolor = ["k", "k", "k", "k", "k"], + + ) + + ## plot compare list + ax.errorbar( + (data_bins[:-1] + data_bins[1:])/2, + np.array(data_counts), + label='Data', + marker = 'o', + color='k', + yerr=np.sqrt(np.array(data_counts)), #*(1/np.sum(data_counts)) + linestyle = "None", + ) + + # plot ratio of com/Ref + nbinning = 50 + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = bins, weights = 
xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)])) + #counts2_dy_pure, bins2_dy_pure = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins = bins) + #print('counts dy') + #print(counts2_dy_pure, np.sqrt(counts2_dy_pure)) + #print(np.sum(counts2_dy_pure)) + #print('unc dy rescaled') + #print(np.sqrt(counts2_dy_pure)*xsec_weights[1]) + counts2_dy_pure, bins2_dy_pure = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins = bins, weights = np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)])) + unc_dy_pure = np.sqrt(counts2_dy_pure)*xsec_weights[1] + #print('counts dy') + #print(counts2_dy_pure, np.sqrt(counts2_dy_pure)) + #print(np.sum(counts2_dy_pure)) + #print('unc dy rescaled') + #print(np.sqrt(counts2_dy_pure)*xsec_weights[1]) + #print(np.sqrt(counts2)) + + + counts3, bins3 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]),bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)])) + + counts3_zz_pure, bins3_zz_pure = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]),bins = bins, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)])) + unc_zz_pure = np.sqrt(counts3_zz_pure)*xsec_weights[2] + + + counts4, bins4 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]),bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)])) + + counts4_wz_pure, bins4_wz_pure = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]),bins = bins, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)])) + unc_wz_pure = np.sqrt(counts4_wz_pure)*xsec_weights[3] + + + counts5, bins5 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]),bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)])) + + counts5_tt_pure, bins5_tt_pure = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]),bins = bins, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)])) + unc_tt_pure = np.sqrt(counts5_tt_pure)*xsec_weights[4] + + + counts6, bins6 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]),bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):])) + + counts6_zhbb_pure, bins6_zhbb_pure = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]),bins = bins, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):])) + unc_zhbb_pure = np.sqrt(counts6_zhbb_pure)*xsec_weights[5] + + + sum_c = counts1 + counts2 + counts3 + counts4 + counts5 + counts6 + mc_arrays = [np.array(var) for var in [unc_dy_pure, unc_zz_pure, unc_wz_pure, unc_tt_pure, unc_zhbb_pure]] + unc_sum = np.sqrt(unc_dy_pure**2 + unc_zz_pure**2 + unc_wz_pure**2 + unc_tt_pure**2 + unc_zhbb_pure**2) + unc_data_minus_mc = np.sqrt(data_counts + unc_sum**2) + + print('rest') + print( data_counts, sum_c, unc_sum) + #print(data_counts - sum_c) + 
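# Uncertainty model for the (Data - MC)/MC ratio computed below: with
+    # D = data_counts and M = sum_c, the code propagates
+    #     sigma_r ~ |r| * sqrt( (sigma_(D-M)/(D-M))**2 + (sigma_M/M)**2 ),
+    # where sigma_(D-M) = sqrt(D + sigma_M**2) combines the Poisson uncertainty
+    # of the data with the MC statistical uncertainty. M enters both terms, so
+    # the correlation between (D - M) and M is neglected here; the exact
+    # Gaussian propagation for r = (D - M)/M would be
+    #     sigma_r**2 = sigma_D**2/M**2 + (D**2 * sigma_M**2)/M**4 . + 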
#print((data_counts - sum_c)/sum_c) + ratio = np.divide((data_counts - sum_c), sum_c, where = (sum_c != 0)) + unc_ratio = ratio*np.sqrt((np.divide(unc_data_minus_mc, (data_counts - sum_c), where = ((data_counts - sum_c) != 0)))**2 + (np.divide(unc_sum, sum_c, where = (sum_c != 0)))**2) + rax.errorbar((data_bins[:-1] + data_bins[1:])/2, ratio, yerr = np.abs(unc_ratio), color = "k", fmt = '.', marker = 'o', markeredgecolor = 'k') + #plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + #rax.plot(bins[:-1], ratio, 'ko') + rax.plot((data_bins[:-1] + data_bins[1:])/2, [0]*len(ratio), '--', color = 'black') + + def line(x, a, b): + return a*x + b + if col == 'Higgs_mass': + popt ,pcov = scipy.optimize.curve_fit(line, bins[:-1], ratio, sigma = np.abs(unc_ratio), absolute_sigma = True, p0= [0, 0.3]) + print(popt, np.sqrt(pcov)) + print(1+popt[1]) + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated_data[c]} {roi_latex}') + ax.set_xlabel(None) + ax.set_ylabel("Events") + rax.set_ylabel('$\\frac{Data - MC}{MC}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(-0.4, 1.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + #hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/No_dense/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/No_dense/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### No rescaling hist density ###################################################################### + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(12, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13", data=True, loc=0, ax=ax) + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]), density = True) + print(f'{col}_{roi}', len_sig, len_dy) + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins =80) + ## plot reference + n_bins = 50 + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + if col == 'Higgs_mass': + bins = np.array([0 + i*((250)/15) for i in range(0, 16)]) + elif col == 'Higgs_pt': + bins = np.array([0 + i*((300)/20) for i in range(0, 21)]) + elif col == 'Z_pt': + bins = np.array([50 + i*((100)/50) for i in range(0, 51)]) + elif col == 'Z_mass': + bins = np.array([75 + i*((30)/50) for i in range(0, 51)]) + elif col == 'jjVptratio': + bins = np.array([0 + i*((2)/15) for i in range(0, 16)]) + elif col == 'CvsL_max': + bins = np.array([0 + 
i*((1)/20) for i in range(0, 21)]) + elif col == 'CvsL_min': + bins = np.array([0 + i*((1)/20) for i in range(0, 21)]) + elif col == 'CvsB_max': + bins = np.array([0 + i*((1)/20) for i in range(0, 21)]) + elif col == 'CvsB_min': + bins = np.array([0 + i*((1)/20) for i in range(0, 21)]) + elif col == 'pt_lead': + bins = np.array([0 + i*((350)/25) for i in range(0, 26)]) + elif col == 'pt_sublead': + bins = np.array([0 + i*((350)/25) for i in range(0, 26)]) + elif col == 'del_phi_jjV': + bins = np.array([0 + i*((np.pi)/15) for i in range(0, 16)]) + elif col == 'del_R_jj': + bins = np.array([0 + i*((5)/15) for i in range(0, 16)]) + elif col == 'del_eta_jj': + bins = np.array([0 + i*((3)/15) for i in range(0, 16)]) + elif col == 'del_phi_ll': + bins = np.array([0 + i*((np.pi)/15) for i in range(0, 16)]) + elif col == 'del_eta_ll': + bins = np.array([0 + i*((2.6)/15) for i in range(0, 16)]) + elif col == 'del_phi_l2_leading': + bins = np.array([0 + i*((np.pi)/15) for i in range(0, 16)]) + elif col == 'del_phi_l2_subleading': + bins = np.array([0 + i*((np.pi)/15) for i in range(0, 16)]) + data_counts, data_bins = np.histogram(np.array(df_data_final[f'{col}_{roi}']),bins =bins, weights = np.array(df_data_final[f'wei_{roi}'])) + counts_sig, bins_sig = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = bins, weights = np.array(df[f'wei_{roi}'][:len_sig])) + hep.histplot( + #np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 25, weights = 100000*xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])), + #label= 'ZH -> cc signal $\cdot 10^5$', + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = bins, weights = 1000*xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])), + #label= 'ZH -> cc signal', + label= 'ZH -> c$\\bar{c}$ \n signal \n ($\cdot 10^3$)', + histtype="step", + color='r', + yerr= 1000*xsec_weights[0]*np.sqrt(counts_sig), + #yerr = True, + ax=ax, + density = False, + ) + #for i in range(0, len(bins2)-1): + # x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + # y_pos_sig = counts1[i] + (counts1[i] * 0.01) + # label_p_sig = str(counts11[i]) + # x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + # y_pos = counts2[i] + (counts2[i] * 0.01) + # label_p = str(counts22[i]) + # if i%5 == 0: + # ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + # if i%6 == 0: + # ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + n_bins = 50 + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + if col == 'Higgs_mass': + bins = np.array([0 + i*((250)/15) for i in range(0, 16)]) + elif col == 'Higgs_pt': + bins = np.array([0 + i*((300)/20) for i in range(0, 21)]) + elif col == 'Z_pt': + bins = np.array([50 + i*((100)/50) for i in range(0, 51)]) + elif col == 'Z_mass': + bins = np.array([75 + i*((30)/50) for i in range(0, 51)]) + elif col == 'jjVptratio': + bins = np.array([0 + i*((2)/15) for i in range(0, 16)]) + elif col == 'CvsL_max': + bins = np.array([0 + i*((1)/20) for i in range(0, 21)]) + elif col == 'CvsL_min': + bins = np.array([0 + i*((1)/20) for i in range(0, 21)]) + elif col == 'CvsB_max': + bins = np.array([0 + i*((1)/20) for i in range(0, 21)]) + elif col == 'CvsB_min': + bins = np.array([0 + i*((1)/20) for i in range(0, 21)]) + elif col == 'pt_lead': + bins = np.array([0 + i*((350)/25) for i in range(0, 26)]) + elif col == 'pt_sublead': + bins = np.array([0 + i*((350)/25) for i in range(0, 26)]) + elif col == 'del_phi_jjV': + bins = np.array([0 + i*((np.pi)/15) for i in 
range(0, 16)]) + elif col == 'del_R_jj': + bins = np.array([0 + i*((5)/15) for i in range(0, 16)]) + elif col == 'del_eta_jj': + bins = np.array([0 + i*((3)/15) for i in range(0, 16)]) + elif col == 'del_phi_ll': + bins = np.array([0 + i*((np.pi)/15) for i in range(0, 16)]) + elif col == 'del_eta_ll': + bins = np.array([0 + i*((2.6)/15) for i in range(0, 16)]) + elif col == 'del_phi_l2_leading': + bins = np.array([0 + i*((np.pi)/15) for i in range(0, 16)]) + elif col == 'del_phi_l2_subleading': + bins = np.array([0 + i*((np.pi)/15) for i in range(0, 16)]) + print(bins) + hep.histplot( + [np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]))], + stack = True, + label=['DY bg', 'ZZ bg', 'WZ bg', 't$\\bar{t}$ bg', 'ZH -> b$\\bar{b}$ \n bg'], + histtype="fill", + color=['g', 'y', 'b', 'm', 'c'], + #bins = np.arange(80), + yerr = True, + #yerr= [np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]))[0]), + # np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]))[0])], + ax=ax, + density = False, + alpha = [0.3, 0.3, 0.3, 0.3, 0.3], + edgecolor = ["k", "k", "k", "k", "k"], + + ) + + ## plot compare list + ax.errorbar( + (data_bins[:-1] + data_bins[1:])/2, + np.array(data_counts), + label='Data', + marker = 'o', + color='k', + yerr=np.sqrt(np.array(data_counts)), #*(1/np.sum(data_counts)) + linestyle = "None", + ) + + # plot ratio of com/Ref + nbinning = 50 + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = bins, weights = xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])) + counts2, bins2 = 
np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)])) + #counts2_dy_pure, bins2_dy_pure = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins = bins) + #print('counts dy') + #print(counts2_dy_pure, np.sqrt(counts2_dy_pure)) + #print(np.sum(counts2_dy_pure)) + #print('unc dy rescaled') + #print(np.sqrt(counts2_dy_pure)*xsec_weights[1]) + counts2_dy_pure, bins2_dy_pure = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins = bins, weights = np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)])) + unc_dy_pure = np.sqrt(counts2_dy_pure)*xsec_weights[1] + #print('counts dy') + #print(counts2_dy_pure, np.sqrt(counts2_dy_pure)) + #print(np.sum(counts2_dy_pure)) + #print('unc dy rescaled') + #print(np.sqrt(counts2_dy_pure)*xsec_weights[1]) + #print(np.sqrt(counts2)) + + + counts3, bins3 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]),bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)])) + + counts3_zz_pure, bins3_zz_pure = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]),bins = bins, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)])) + unc_zz_pure = np.sqrt(counts3_zz_pure)*xsec_weights[2] + + + counts4, bins4 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]),bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)])) + + counts4_wz_pure, bins4_wz_pure = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]),bins = bins, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)])) + unc_wz_pure = np.sqrt(counts4_wz_pure)*xsec_weights[3] + + + counts5, bins5 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]),bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)])) + + counts5_tt_pure, bins5_tt_pure = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]),bins = bins, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)])) + unc_tt_pure = np.sqrt(counts5_tt_pure)*xsec_weights[4] + + + counts6, bins6 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]),bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):])) + + counts6_zhbb_pure, bins6_zhbb_pure = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]),bins = bins, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):])) + unc_zhbb_pure = np.sqrt(counts6_zhbb_pure)*xsec_weights[5] + + + sum_c = counts1 + counts2 + counts3 + counts4 + counts5 + counts6 + mc_arrays = [np.array(var) for var in [unc_dy_pure, unc_zz_pure, unc_wz_pure, unc_tt_pure, unc_zhbb_pure]] + unc_sum = np.sqrt(unc_dy_pure**2 + unc_zz_pure**2 + unc_wz_pure**2 + unc_tt_pure**2 + unc_zhbb_pure**2) + unc_data_minus_mc = np.sqrt(data_counts + unc_sum**2) + + print('rest') + print( data_counts, sum_c, unc_sum) + #print(data_counts - sum_c) + #print((data_counts - sum_c)/sum_c) + ratio = np.divide((data_counts - sum_c), sum_c, where 
= (sum_c != 0)) + unc_ratio = ratio*np.sqrt((np.divide(unc_data_minus_mc, (data_counts - sum_c), where = ((data_counts - sum_c) != 0)))**2 + (np.divide(unc_sum, sum_c, where = (sum_c != 0)))**2) + rax.errorbar((data_bins[:-1] + data_bins[1:])/2, ratio, yerr = np.abs(unc_ratio), color = "k", fmt = '.', marker = 'o', markeredgecolor = 'k') + #plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + #rax.plot(bins[:-1], ratio, 'ko') + rax.plot((data_bins[:-1] + data_bins[1:])/2, [0]*len(ratio), '--', color = 'black') + + def line(x, a, b): + return a*x + b + if col == 'Higgs_mass': + popt ,pcov = scipy.optimize.curve_fit(line, bins[:-1], ratio, sigma = np.abs(unc_ratio), absolute_sigma = True, p0= [0, 0.3]) + print(popt, np.sqrt(pcov)) + print(1+popt[1]) + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated_data[c]} {roi_latex}') + ax.set_xlabel(None) + ax.set_ylabel("Events") + rax.set_ylabel('$\\frac{Data - MC}{MC}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + box = ax.get_position() + ax.set_position([box.x0, box.y0 , box.width*0.8, box.height]) + # Put a legend to the right of the current axis + ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), fancybox = True, shadow = True, ncols = 1, fontsize = 'x-small', labelspacing = 1.6) + + rax.set_ylim(-0.4, 1.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + if col == 'Higgs_mass': + minval, maxval = 0, 250 + elif col == 'Higgs_pt': + minval, maxval = 0, 300 + elif col == 'Z_pt': + minval, maxval = 50, 150 + elif col == 'Z_mass': + minval, maxval = 75, 105 + elif col == 'jjVptratio': + minval, maxval = 0, 2 + elif col == 'CvsL_max': + minval, maxval = 0, 1 + elif col == 'CvsL_min': + minval, maxval = 0, 1 + elif col == 'CvsB_max': + minval, maxval = 0, 1 + elif col == 'CvsB_min': + minval, maxval = 0, 1 + elif col == 'pt_lead': + minval, maxval = 0, 350 + elif col == 'pt_sublead': + minval, maxval = 0, 350 + elif col == 'del_phi_jjV': + minval, maxval = 0, np.pi + elif col == 'del_R_jj': + minval, maxval = 0, 5 + elif col == 'del_eta_jj': + minval, maxval = 0, 3 + elif col == 'del_phi_ll': + minval, maxval = 0, np.pi + elif col == 'del_eta_ll': + minval, maxval = 0, 2.6 + elif col == 'del_phi_l2_leading': + minval, maxval = 0, np.pi + elif col == 'del_phi_l2_subleading': + minval, maxval = 0, np.pi + rax.set_xlim(minval, maxval) + boxr = rax.get_position() + rax.set_position([boxr.x0, boxr.y0, boxr.width*0.8, boxr.height]) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + #hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/Np_dense/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Np_dense/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### No rescaling hist density True ################################################################# + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(12, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, 
bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13", data=True, loc=0, ax=ax) + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]), density = True) + print(f'{col}_{roi}', len_sig, len_dy) + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins =80) + ## plot reference + n_bins = 50 + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + bins = np.array([minval + i*((maxval-minval)/50) for i in range(0, 51)]) + if col == 'jjVptratio': + bins = np.array([0 + i*((2)/50) for i in range(0, 51)]) + data_counts, data_bins = np.histogram(np.array(df_data_final[f'{col}_{roi}']),bins =bins, weights = np.array(df_data_final[f'wei_{roi}'])) + counts_sig, bins_sig = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = bins, weights = np.array(df[f'wei_{roi}'][:len_sig])) + hep.histplot( + #np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 25, weights = 100000*xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])), + #label= 'ZH -> cc signal $\cdot 10^5$', + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = bins, weights = 1000*xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])), + #label= 'ZH -> cc signal', + label= 'ZH -> c$\\bar{c}$ \n signal \n ($\cdot 10^3$)', + histtype="step", + color='r', + yerr= 1000*xsec_weights[0]*np.sqrt(counts_sig), + #yerr = True, + ax=ax, + density = False, + ) + #for i in range(0, len(bins2)-1): + # x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + # y_pos_sig = counts1[i] + (counts1[i] * 0.01) + # label_p_sig = str(counts11[i]) + # x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + # y_pos = counts2[i] + (counts2[i] * 0.01) + # label_p = str(counts22[i]) + # if i%5 == 0: + # ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + # if i%6 == 0: + # ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + n_bins = 50 + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + bins = np.array([minval + i*((maxval-minval)/50) for i in range(0, 51)]) + if col == 'jjVptratio': + bins = np.array([0 + i*((2)/50) for i in range(0, 51)]) + print(bins) + hep.histplot( + [np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), bins = bins, weights = 
xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]))], + stack = True, + label=['DY bg', 'ZZ bg', 'WZ bg', 't$\\bar{t}$ bg', 'ZH -> b$\\bar{b}$ \n bg'], + histtype="fill", + color=['g', 'y', 'b', 'm', 'c'], + #bins = np.arange(80), + yerr = True, + #yerr= [np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]))[0]), + # np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]))[0])], + ax=ax, + density = False, + alpha = [0.3, 0.3, 0.3, 0.3, 0.3], + edgecolor = ["k", "k", "k", "k", "k"], + + ) + + ## plot compare list + ax.errorbar( + (data_bins[:-1] + data_bins[1:])/2, + np.array(data_counts), + label='Data', + marker = 'o', + color='k', + yerr=np.sqrt(np.array(data_counts)), #*(1/np.sum(data_counts)) + linestyle = "None", + ) + + # plot ratio of com/Ref + nbinning = 50 + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = bins, weights = xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)])) + #counts2_dy_pure, bins2_dy_pure = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins = bins) + #print('counts dy') + #print(counts2_dy_pure, np.sqrt(counts2_dy_pure)) + #print(np.sum(counts2_dy_pure)) + #print('unc dy rescaled') + #print(np.sqrt(counts2_dy_pure)*xsec_weights[1]) + counts2_dy_pure, bins2_dy_pure = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins = bins, weights = np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)])) + unc_dy_pure = np.sqrt(counts2_dy_pure)*xsec_weights[1] + #print('counts dy') + #print(counts2_dy_pure, np.sqrt(counts2_dy_pure)) + #print(np.sum(counts2_dy_pure)) + #print('unc dy rescaled') + #print(np.sqrt(counts2_dy_pure)*xsec_weights[1]) + #print(np.sqrt(counts2)) + + + counts3, bins3 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]),bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)])) + + counts3_zz_pure, bins3_zz_pure = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]),bins = bins, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)])) + unc_zz_pure = np.sqrt(counts3_zz_pure)*xsec_weights[2] + + + counts4, bins4 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]),bins = bins, weights = 
xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)])) + + counts4_wz_pure, bins4_wz_pure = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]),bins = bins, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)])) + unc_wz_pure = np.sqrt(counts4_wz_pure)*xsec_weights[3] + + + counts5, bins5 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]),bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)])) + + counts5_tt_pure, bins5_tt_pure = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]),bins = bins, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)])) + unc_tt_pure = np.sqrt(counts5_tt_pure)*xsec_weights[4] + + + counts6, bins6 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]),bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):])) + + counts6_zhbb_pure, bins6_zhbb_pure = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]),bins = bins, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):])) + unc_zhbb_pure = np.sqrt(counts6_zhbb_pure)*xsec_weights[5] + + + sum_c = counts1 + counts2 + counts3 + counts4 + counts5 + counts6 + mc_arrays = [np.array(var) for var in [unc_dy_pure, unc_zz_pure, unc_wz_pure, unc_tt_pure, unc_zhbb_pure]] + unc_sum = np.sqrt(unc_dy_pure**2 + unc_zz_pure**2 + unc_wz_pure**2 + unc_tt_pure**2 + unc_zhbb_pure**2) + unc_data_minus_mc = np.sqrt(data_counts + unc_sum**2) + + print('rest') + print( data_counts, sum_c, unc_sum) + #print(data_counts - sum_c) + #print((data_counts - sum_c)/sum_c) + ratio = np.divide((data_counts - sum_c), sum_c, where = (sum_c != 0)) + unc_ratio = ratio*np.sqrt((np.divide(unc_data_minus_mc, (data_counts - sum_c), where = ((data_counts - sum_c) != 0)))**2 + (np.divide(unc_sum, sum_c, where = (sum_c != 0)))**2) + rax.errorbar((data_bins[:-1] + data_bins[1:])/2, ratio, yerr = np.abs(unc_ratio), color = "k", fmt = '.', marker = 'o', markeredgecolor = 'k') + #plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + #rax.plot(bins[:-1], ratio, 'ko') + rax.plot((data_bins[:-1] + data_bins[1:])/2, [0]*len(ratio), '--', color = 'black') + + def line(x, a, b): + return a*x + b + if col == 'Higgs_mass': + popt ,pcov = scipy.optimize.curve_fit(line, bins[:-1], ratio, sigma = np.abs(unc_ratio), absolute_sigma = True, p0= [0, 0.3]) + print(popt, np.sqrt(pcov)) + print(1+popt[1]) + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated_data[c]} {roi_latex}') + ax.set_xlabel(None) + ax.set_ylabel("Events") + rax.set_ylabel('$\\frac{Data - MC}{MC}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + box = ax.get_position() + ax.set_position([box.x0, box.y0 , box.width*0.8, box.height]) + # Put a legend to the right of the current axis + ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), fancybox = True, shadow = True, ncols = 1, fontsize = 'x-small', labelspacing = 1.6) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + + if col == 'jjVptratio': + minval, maxval = 0, 2 + boxr = rax.get_position() + rax.set_position([boxr.x0, 
boxr.y0, boxr.width*0.8, boxr.height]) + rax.set_xlim(minval, maxval) + rax.set_ylim(-0.4, 1.0) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + #hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/Np_dense_True/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Np_dense_True/compare_{col}_{roi}.jpg") + + c += 1 + +X = df.drop("target", axis = 1) +X = X.drop("target_bg", axis = 1) +print(X) +X = X.drop(f"wei_{roi}", axis = 1) +X = X.drop(f"Z_mass_{roi}", axis = 1) +X = X.drop(f"Z_pt_gen_{roi}", axis = 1) +X = X.drop(f"Z_mass_gen_{roi}", axis = 1) +print(X) +print(X.info()) + +y = df["target"] +print(y) + + + + +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder + +categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + +from sklearn.preprocessing import StandardScaler +numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + +cat_cols = X.select_dtypes(exclude = "number").columns +num_cols = X.select_dtypes(include = "number").columns + +print(cat_cols) +print(num_cols) + +from sklearn.compose import ColumnTransformer + +full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + +import xgboost as xgb + +X_processed = full_processor.fit_transform(X) +y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state = 1121218) + +from sklearn.metrics import accuracy_score + +### Init classifier +xgb_cl = xgb.XGBClassifier(booster = 'gbtree', base_score = 0.5, learning_rate = 0.01, gamma = 1, reg_alpha = 0.2, reg_lambda = 0.2, n_estimators = 1000, max_depth = 3, subsample = 0.8) + +### Fit +xgb_cl.fit(X_train, y_train) + +print(xgb_cl) +### Predict +preds = xgb_cl.predict(X_test) + +print(accuracy_score(y_test, preds)) + +from xgboost import plot_importance +from xgboost import plot_tree + +importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_}) +importances = importances.sort_values(by = "Importance", ascending = False) +importances = importances.set_index('Feature') +print(importances) +importances.plot.bar() + +fig, ax = plt.subplots(figsize=(17,12)) +plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax) +plt.xlabel('Feature scores') +plt.ylabel("Feature names") +plt.title('Importance plot') +plt.legend(['']) +#plt.show() +plt.savefig(f"./plot/{folder_save}/importance.jpg") + +plt.figure(figsize=(17,12)) +plot_tree(xgb_cl, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"./plot/{folder_save}/boost_tree.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show() diff --git a/xgb_test_no_coffea_diff_bgs_DATA_scale_pandas_numpy_test.py 
b/xgb_test_no_coffea_diff_bgs_DATA_scale_pandas_numpy_test.py new file mode 100644 index 0000000..ec6fbfc --- /dev/null +++ b/xgb_test_no_coffea_diff_bgs_DATA_scale_pandas_numpy_test.py @@ -0,0 +1,813 @@ +from coffea.util import load +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt, mplhep as hep +import hist +import argparse, sys, os, arrow, glob, yaml +from matplotlib.offsetbox import AnchoredText +from pathlib import Path +import os +from BTVNanoCommissioning.utils.plot_utils import ( + plotratio, + +) +net_path = "/net/scratch_cms3a/vaulin/" +folder_save = 'eval_23_09_14' +if not os.path.exists(f"./plot/{folder_save}"): + os.mkdir(f"./plot/{folder_save}") +if not os.path.exists(f"./plot/{folder_save}/No_dense"): + os.mkdir(f"./plot/{folder_save}/No_dense") +if not os.path.exists(f"./plot/{folder_save}/Np_dense"): + os.mkdir(f"./plot/{folder_save}/Np_dense") +if not os.path.exists(f"./plot/{folder_save}/Np_dense_True"): + os.mkdir(f"./plot/{folder_save}/Np_dense_True") +if not os.path.exists(net_path + f"plot/{folder_save}"): + os.mkdir(net_path + f"plot/{folder_save}") +def autoranger(array): + val, axis = array, np.arange(0,len(array)+1) + for i in range(len(val)): + if val[i] != 0: + mins = i + break + for i in reversed(range(len(val))): + if val[i] != 0: + maxs = i + 1 + break + print(axis[mins], axis[maxs]) + return axis[mins], axis[maxs], np.max(val), np.min(val) +names_sig = ['wei', 'Higgs_mass', 'Higgs_pt', 'Z_pt', 'Z_mass', 'Z_pt_gen', 'Z_mass_gen', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +names_sig_data = ['wei', 'Higgs_mass', 'Higgs_pt', 'Z_pt', 'Z_mass', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +roiis = ['high_mumu', 'high_ee', 'low_mumu', 'low_ee'] +roi = 'low_mumu' +#roi = 'low_ee' + +###################################################################################### +##### Read np arrays of signal sample ################################################ +###################################################################################### +''' +data_path = 'condor_signal_06_mid/' +paths_np = [str(x) for x in Path(data_path + "ZHToCC_vau_sig").glob("**/*.npy") if ("_full" in str(x))] +#print(paths_np) +print(len(paths_np)) +df_sig_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_sig_full_np) + +key_np = {} +for col in names_sig: + for rois in roiis: + key_np[f'{col}_{rois}'] = [] +for col in names_sig: + for rois in roiis: + for path in paths_np: + if f'{col}_{rois}' in path: + key_np[f'{col}_{rois}'].append(path) + +for key in key_np.keys(): + #print(len(key_np[key]) == len(set(key_np[key]))) + key_np[key] = [np.load(element) for element in key_np[key]] + #print(key) + +print(key_np) + +key_np_full = {} +max_length = 0 +for col in names_sig: + for rois in roiis: + key_np_full[f'{col}_{rois}'] = np.array([]) +print(key_np_full) +for key in key_np_full.keys(): + key_np_full[key] = np.concatenate(tuple(key_np[key]), axis = None) + print(len(key_np_full[key])) + if max_length < len(key_np_full[key]): + max_length = len(key_np_full[key]) + +for key in key_np_full.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + 
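# Each ROI keeps a different number of selected events, so the per-key
+    # arrays are ragged; every column is padded with NaN up to max_length to
+    # fit a single rectangular DataFrame, and the valid entries are recovered
+    # later with dropna(). Minimal sketch of the padding idea used in the next
+    # line (illustrative only):
+    #     padded = np.append(arr, np.repeat(np.nan, max_length - len(arr))) + 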
df_sig_full_np[key] = list(np.append(key_np_full[key], np.repeat(np.nan, max_length- (len(key_np_full[key]))))) +#df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_sig_full_np) +df_s_new_np = df_sig_full_np[[f'{col}_{roi}' for col in names_sig]] + +print(len(df_s_new_np[f"wei_{roi}"])) +our_aray_results = len(df_s_new_np[f"wei_{roi}"]) + + + +df_s_new_np = df_s_new_np.dropna() +print(df_s_new_np) +len_var = [] +for col in names_sig: + len_var.append(len(df_s_new_np[f'{col}_{roi}'])) + df_s_new_np['target'] = np.ones(np.max(len_var)) + df_s_new_np['target_bg'] = np.zeros(np.max(len_var)) +print(df_s_new_np) + + +df_s_new_np.to_csv(f'./plot/{folder_save}/numpy_data_signal_{roi}.csv', sep=',', encoding='utf-8', index=False) +''' +df_s_new_np = pd.read_csv(f'./plot/{folder_save}/numpy_data_signal_{roi}.csv', sep=',', encoding='utf-8') +###################################################################################### + + +###################################################################################### +##### Read np arrays of background sample ############################################ +###################################################################################### +''' +data_path = 'condor_back_09_late/' +def bg_processor(bg, nr): + paths_np_back = [str(x) for x in Path(data_path + f"{bg}").glob("**/*.npy") if ("_full" in str(x))] + #paths_np_back = [str(x) for x in Path("./condor_back_04_mid/DYJetsToLL_nlo_vau_bg").glob("**/*.npy") if ("_full" in str(x))] + #print(paths_np_back) + print(len(paths_np_back)) + df_back_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) + print(df_back_full_np) + + key_np_back = {} + for col in names_sig: + for rois in roiis: + key_np_back[f'{col}_{rois}'] = [] + for col in names_sig: + for rois in roiis: + for path in paths_np_back: + if f'{col}_{rois}' in path: + key_np_back[f'{col}_{rois}'].append(path) + #print(key_np_back) + for key in key_np_back.keys(): + print(len(key_np_back[key]) == len(set(key_np_back[key]))) + for element in key_np_back[key]: + print(element) + np.load(element, allow_pickle = True) + key_np_back[key] = [np.load(element) for element in key_np_back[key]] + print(key) + + #print(key_np_back) + + max_length_back = 0 + key_np_full_back = {} + for col in names_sig: + for rois in roiis: + key_np_full_back[f'{col}_{rois}'] = np.array([]) + for key in key_np_full_back.keys(): + key_np_full_back[key] = np.concatenate(tuple(key_np_back[key]), axis = None) + print(len(key_np_full_back[key])) + if max_length_back < len(key_np_full_back[key]): + max_length_back = len(key_np_full_back[key]) + #print(key_np_full_back) + + for key in key_np_full_back.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_back_full_np[key] = list(np.append(key_np_full_back[key], np.repeat(np.nan, max_length_back- (len(key_np_full_back[key]))))) + #df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) + print(df_back_full_np) + df_b_full_np = df_back_full_np[[f'{col}_{roi}' for col in names_sig]] + df_b_new_np = df_b_full_np.dropna() + print(df_b_new_np) + + len_var = [] + for col in names_sig: + len_var.append(len(df_b_new_np[f'{col}_{roi}'])) + df_b_new_np['target'] = np.zeros(np.max(len_var)) + df_b_new_np['target_bg'] = np.array([nr]*np.max(len_var)) + print(df_b_new_np) + 
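# Labeling convention: 'target' is the binary label the XGBoost classifier
+    # trains on (1 = signal, 0 = background), while 'target_bg' stores the
+    # integer id passed in as nr (1 = DY, 2 = ZZ, 3 = WZ, 4 = ttbar,
+    # 5 = ZH -> bb), so individual backgrounds can still be sliced out of the
+    # merged training DataFrame. The CSV written below caches the result, so
+    # bg_processor only has to run once per background sample. + 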
df_b_new_np.to_csv(f'./plot/{folder_save}/numpy_data_bg_{bg}_{roi}.csv', sep=',', encoding='utf-8', index=False) + return df_b_new_np, len(df_b_new_np['target']) +''' + +#df_b_new_np_dy, len_dy = bg_processor("DYJetsToLL_nlo_vau_bg", 1) +#df_b_new_np_zz, len_zz = bg_processor("ZZTo2L2Q_vau_bg", 2) +#df_b_new_np_wz, len_wz = bg_processor("WZTo2Q2L_vau_bg", 3) +#df_b_new_np_tt, len_tt = bg_processor("TTTo2L2Nu_vau_bg", 4) +#df_b_new_np_zhtobb, len_zhtobb = bg_processor("ZH_HToBB_ZLL_vau_bg_old", 5) + +df_b_new_np_dy, len_dy = pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_DYJetsToLL_nlo_vau_bg_{roi}.csv', sep=',', encoding='utf-8'), len(pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_DYJetsToLL_nlo_vau_bg_{roi}.csv', sep=',', encoding='utf-8')['target']) +df_b_new_np_zz, len_zz = pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_ZZTo2L2Q_vau_bg_{roi}.csv', sep=',', encoding='utf-8'), len(pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_ZZTo2L2Q_vau_bg_{roi}.csv', sep=',', encoding='utf-8')['target']) +df_b_new_np_wz, len_wz = pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_WZTo2Q2L_vau_bg_{roi}.csv', sep=',', encoding='utf-8'), len(pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_WZTo2Q2L_vau_bg_{roi}.csv', sep=',', encoding='utf-8')['target']) +df_b_new_np_tt, len_tt = pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_TTTo2L2Nu_vau_bg_{roi}.csv', sep=',', encoding='utf-8'), len(pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_TTTo2L2Nu_vau_bg_{roi}.csv', sep=',', encoding='utf-8')['target']) +df_b_new_np_zhtobb, len_zhtobb = pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_ZH_HToBB_ZLL_vau_bg_{roi}.csv', sep=',', encoding='utf-8'), len(pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_ZH_HToBB_ZLL_vau_bg_{roi}.csv', sep=',', encoding='utf-8')['target']) + +max_len_bg = 0 +for l in [len_dy, len_zz, len_wz, len_tt, len_zhtobb]: + if max_len_bg < l: + max_len_bg = l + +###################################################################################### +##### Read np arrays of data sample ################################################## +###################################################################################### +data_path = 'condor_back_09_late/' +datas = ["Run2017B_DoubleEG_vau", "Run2017C_DoubleEG_vau", "Run2017E_DoubleEG_vau"] #, "Run2017D_DoubleEG_vau", "Run2017F_DoubleEG_vau", "Run2017F_DoubleMu_vau", Run2017D_DoubleEG_vau", "Run2017B_DoubleMu_vau", "Run2017C_DoubleMu_vau", "Run2017D_DoubleMu_vau", "Run2017E_DoubleMu_vau", + +for data in datas: + + paths_np_data = [str(x) for x in Path(data_path + data).glob("**/*.npy") if ("_full" in str(x))] + + print(len(paths_np_data), data) + df_data_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig_data for rois in roiis]) + print(df_data_full_np) + + key_np_data = {} + for col in names_sig_data: + for rois in roiis: + key_np_data[f'{col}_{rois}'] = [] + for col in names_sig_data: + for rois in roiis: + for path in paths_np_data: + if f'{col}_{rois}' in path: + key_np_data[f'{col}_{rois}'].append(path) + #print(key_np_back) + for key in key_np_data.keys(): + print(len(key_np_data[key]) == len(set(key_np_data[key]))) + for element in key_np_data[key]: + print(element) + np.load(element, allow_pickle = True) + key_np_data[key] = [np.load(element, allow_pickle = True) for element in key_np_data[key]] + print(key) + + #print(key_np_back) + + max_length_data = 0 + key_np_full_data = {} + for col in names_sig_data: + for rois in roiis: + key_np_full_data[f'{col}_{rois}'] = np.array([]) + for key in 
key_np_full_data.keys(): + key_np_full_data[key] = np.concatenate(tuple(key_np_data[key]), axis = None) + print(len(key_np_full_data[key])) + if max_length_data < len(key_np_full_data[key]): + max_length_data = len(key_np_full_data[key]) + #print(key_np_full_back) + + for key in key_np_full_data.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_data_full_np[key] = list(np.append(key_np_full_data[key], np.repeat(np.nan, max_length_data- (len(key_np_full_data[key]))))) + #df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) + print(df_data_full_np) + for r in roiis: + df_data = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig_data for rois in roiis]) + df_dat_full_np = df_data_full_np[[f'{col}_{r}' for col in names_sig_data]] + df_dat_new_np = df_dat_full_np.dropna() + print(df_dat_new_np) + len_var = [] + for col in names_sig_data: + len_var.append(len(df_dat_new_np[f'{col}_{r}'])) + df_dat_new_np['target'] = np.full(np.max(len_var), 2, dtype = int) + print(df_dat_new_np) + df_data = pd.concat([df_data, df_dat_new_np], ignore_index = True) + df_data.to_csv(f'./plot/{folder_save}/numpy_data_DATA_{r}.csv', sep=',', encoding='utf-8', index=False) +df_data = pd.read_csv(f'./plot/{folder_save}/numpy_data_DATA_{roi}.csv', sep=',', encoding='utf-8') +###################################################################################### +###################################################################################### +#folder_save = 'eval_23_07_25_2' +df = pd.concat([df_s_new_np, df_b_new_np_dy], ignore_index = True) +df = pd.concat([df, df_b_new_np_zz], ignore_index = True) +df = pd.concat([df, df_b_new_np_wz], ignore_index = True) +df = pd.concat([df, df_b_new_np_tt], ignore_index = True) +df = pd.concat([df, df_b_new_np_zhtobb], ignore_index = True) +print(df) +print(df.info()) +df.to_csv(net_path + f'/xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8', index=False) +df.to_csv(f'./plot/{folder_save}/xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8', index=False) + +xsec_weights = [0.002342*(41480/3323082), 6077.*(41480/102863931), 3.74*(41480/19134840), + 6.419*(41480/18136498), 88.51*(41480/105859990), 0.00720*(41480/4337504)] + +#xsec_weights = [1 , 1, 1, 1, 1, 1] + +#df = pd.read_csv(f'xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8') + +print("% of negative weights: " + str(len(df[f"wei_{roi}"][df[f"wei_{roi}"]<0])/len(df[f"wei_{roi}"]))) + +time = arrow.now().format("YY_MM_DD") +plt.style.use(hep.style.ROOT) +names_sig_updated = ['m(H)', '$p_t$(H)', '$p_t$(Z)', 'm(Z)', '$p_t$($Z_{gen}$)', 'm($Z_{gen}$)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$', + '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet', + '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$', + '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$'] + +names_sig_updated_data = ['m(H)', '$p_t$(H)', '$p_t$(Z)', 'm(Z)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$', + '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet', + '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$', + '$\Delta\Phi (l_{subleading}, 
jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$'] + + +c = 0 +for col in names_sig_data[1:]: + + plt.figure(figsize=(10,10)) + len_sig = 0 + for i in range(0,len(df['target'])): + if df['target'][i] == 1: + len_sig += 1 + print(len_sig) + names_big_ax = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'pt_lead', 'pt_sublead'] + if col in names_big_ax: + hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_{roi}'][:len_sig])).plot() + hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_{roi}'][len_sig:])).plot() + else: + hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_{roi}'][:len_sig])).plot() + hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_{roi}'][len_sig:])).plot() + if 'pt' in col: + if 'ratio' not in col: + plt.xlabel('$p_t$ in Gev') + else: + plt.xlabel('') + elif 'mass' in col: + plt.xlabel('Mass in Gev') + else: + plt.xlabel('') + plt.ylabel("Counts") + plt.title(f'{names_sig_updated[c]}_low_ee') + plt.legend(['Signal', 'Background']) + #plt.show() + plt.savefig(f"./plot/{folder_save}/{col}_{roi}.jpg") + + + + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + data_counts, data_bins = np.histogram(np.array(df_data[f'{col}_{roi}']),bins =50, weights = np.array(df_data[f'wei_{roi}'])) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins =80) + ## plot reference + n_bins = 80 + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + bins = np.array([minval + i*((maxval-minval)/50) for i in range(0, 51)]) + + hep.histplot( + #np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 25, weights = 100000*xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])), + #label= 'ZH -> cc signal $\cdot 10^5$', + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 50, weights = xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])), + label= 'ZH -> cc signal', + histtype="step", + color='r', + #yerr= np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 25, weights = 100000*xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig]))[0]), + yerr = True, + ax=ax, + density = False, + ) + #for i in range(0, len(bins2)-1): + # x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + # y_pos_sig = counts1[i] + (counts1[i] * 0.01) + # label_p_sig = str(counts11[i]) + # x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + # y_pos = counts2[i] + (counts2[i] * 0.01) + # label_p = str(counts22[i]) + # if i%5 == 0: + # ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + # if i%6 == 0: + # ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + n_bins = 80 + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + bins = np.array([minval + i*((maxval-minval)/50) for i in range(0, 51)]) + print(bins) + hep.histplot( + 
[np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]))], + stack = True, + label=['DY bg', 'ZZ bg', 'WZ bg', 'tt bg', 'ZH -> bb bg'], + histtype="fill", + color=['g', 'y', 'b', 'm', 'c'], + #bins = np.arange(80), + yerr = True, + #yerr= [np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]))[0]), + # np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]))[0])], + ax=ax, + density = False, + alpha = [0.3, 0.3, 0.3, 0.3, 0.3], + edgecolor = ["k", "k", "k", "k", "k"], + + ) + + ## plot compare list + ax.errorbar( + (data_bins[:-1] + data_bins[1:])/2, + np.array(data_counts), + label='Data', + marker = 'o', + color='k', + yerr=np.sqrt(np.array(data_counts)), #*(1/np.sum(data_counts)) + linestyle = "None", + ) + + # plot ratio of com/Ref + nbinning = 50 + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]), density = True) + counts3, bins3 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), density = True) + counts4, bins4 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), density = True) + counts5, bins5 = 
np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), density = True) + counts6, bins6 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), density = True) + + ratio_dy = np.divide(counts1, counts2, where = (counts2 != 0)) + ratio_zz = np.divide(counts1, counts3, where = (counts3 != 0)) + ratio_wz = np.divide(counts1, counts4, where = (counts4 != 0)) + ratio_tt = np.divide(counts1, counts5, where = (counts5 != 0)) + ratio_zhtobb = np.divide(counts1, counts6, where = (counts6 != 0)) + rax.plot(bins1[:-1], ratio_dy, 'go') + rax.plot(bins1[:-1], ratio_zz, 'yo') + rax.plot(bins1[:-1], ratio_wz, 'bo') + rax.plot(bins1[:-1], ratio_tt, 'mo') + rax.plot(bins1[:-1], ratio_zhtobb, 'co') + rax.plot(bins1[:-1], [1]*len(ratio_dy), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated_data[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 4.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + #hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### No rescaling #################################################################################### + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig])) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:])) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig])), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if 
i%5 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%6 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:])), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/No_dense/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/No_dense/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### No rescaling hist density ###################################################################### + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, density = True) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, density = True) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%5 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', 
color = 'green') + if i%6 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/Np_dense/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Np_dense/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### No rescaling hist density True ################################################################# + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, density = True) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, density = True) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + density = True, + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%5 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%6 == 0: + 
ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + density = True, + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/Np_dense_True/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Np_dense_True/compare_{col}_{roi}.jpg") + + c += 1 + +X = df.drop("target", axis = 1) +X = X.drop("target_bg", axis = 1) +print(X) +X = X.drop(f"wei_{roi}", axis = 1) +X = X.drop(f"Z_mass_{roi}", axis = 1) +X = X.drop(f"Z_pt_gen_{roi}", axis = 1) +X = X.drop(f"Z_mass_gen_{roi}", axis = 1) +print(X) +print(X.info()) + +y = df["target"] +print(y) + + + + +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder + +categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + +from sklearn.preprocessing import StandardScaler +numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + +cat_cols = X.select_dtypes(exclude = "number").columns +num_cols = X.select_dtypes(include = "number").columns + +print(cat_cols) +print(num_cols) + +from sklearn.compose import ColumnTransformer + +full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + +import xgboost as xgb + +X_processed = full_processor.fit_transform(X) +y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state = 1121218) + +from sklearn.metrics import accuracy_score + +### Init classifier +xgb_cl = xgb.XGBClassifier(booster = 'gbtree', base_score = 0.5, learning_rate = 0.01, gamma = 1, reg_alpha = 0.2, reg_lambda = 0.2, n_estimators = 1000, max_depth = 3, 
subsample = 0.8) + +### Fit +xgb_cl.fit(X_train, y_train) + +print(xgb_cl) +### Predict +preds = xgb_cl.predict(X_test) + +print(accuracy_score(y_test, preds)) + +from xgboost import plot_importance +from xgboost import plot_tree + +importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_}) +importances = importances.sort_values(by = "Importance", ascending = False) +importances = importances.set_index('Feature') +print(importances) +importances.plot.bar() + +fig, ax = plt.subplots(figsize=(17,12)) +plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax) +plt.xlabel('Feature scores') +plt.ylabel("Feature names") +plt.title('Importance plot') +plt.legend(['']) +#plt.show() +plt.savefig(f"./plot/{folder_save}/importance.jpg") + +plt.figure(figsize=(17,12)) +plot_tree(xgb_cl, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"./plot/{folder_save}/boost_tree.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show() diff --git a/xgb_test_only_xgb_no_coffea.py b/xgb_test_only_xgb_no_coffea.py index dd3b1d1..e8bc064 100644 --- a/xgb_test_only_xgb_no_coffea.py +++ b/xgb_test_only_xgb_no_coffea.py @@ -17,13 +17,16 @@ ## Create the folder to save the data if it doesn't exist and read in the dataframe ### ####################################################################################### net_path = "/net/scratch_cms3a/vaulin/" -folder_save = 'eval_23_04_11' +folder_save = 'eval_23_07_17_2' roi = 'low_mumu' if not os.path.exists(f"./plot/{folder_save}"): os.mkdir(f"./plot/{folder_save}") +if not os.path.exists(f"./plot/{folder_save}/ROI_simple"): + os.mkdir(f"./plot/{folder_save}/ROI_simple") + if not os.path.exists(net_path + f"plot/{folder_save}"): os.mkdir(net_path + f"plot/{folder_save}") -df = pd.read_csv(net_path + f'xgb_training_dataset_{roi}.csv') +df = pd.read_csv(f'./plot/{folder_save}/xgb_training_dataset_{roi}.csv') time = arrow.now().format("YY_MM_DD") plt.style.use(hep.style.ROOT) @@ -33,6 +36,7 @@ ########## drop target from df and bring it to a separate column, drop weights ######### ######################################################################################## X = df.drop("target", axis = 1) +X = X.drop("target_bg", axis = 1) print(X) X = X.drop(f"wei_{roi}", axis = 1) X = X.drop(f"Z_mass_{roi}", axis = 1) @@ -91,6 +95,7 @@ def objective(space): print("SCORE: ", accuracy) return {'loss': -accuracy, 'status': STATUS_OK} + ######################################################################################### ############# Create pipelines for xgb training ######################################### ######################################################################################### @@ -132,7 +137,7 @@ def objective(space): ############################################################################################################ ######### preparing the XGB classifiers in 20 x 5-folds cross validation using repeated k-fold ############# ############################################################################################################ -cv = RepeatedKFold(n_splits = 5, n_repeats = 20, random_state = 101) +cv = RepeatedKFold(n_splits = 8, n_repeats = 20, random_state = 101) folds = [(train, test) for train, test in cv.split(X_train, y_train)] #print(folds) metrics = ['auc', 'fpr', 'tpr', 'thresholds'] @@ -193,17 +198,17 @@ def convert(x): return x.tolist() raise TypeError(x) -with open(net_path + 
f"plot/{folder_save}/results_lr_{eta}.json", 'w') as outfile: +with open(f"./plot/{folder_save}/results_lr_{eta}.json", 'w') as outfile: #json.dump(results, outfile, indent = 4) str_j = json.dumps(results, indent = 4, sort_keys = True, default=convert) outfile.write(str_j) -with open(net_path + f"plot/{folder_save}/results_zero_train_lr_{eta}.json", 'w') as outfile: +with open(f"./plot/{folder_save}/results_zero_train_lr_{eta}.json", 'w') as outfile: #json.dump(results, outfile, indent = 4) str_j = json.dumps(results_zero_train, indent = 4, sort_keys = True, default=convert) outfile.write(str_j) -with open(net_path + f"plot/{folder_save}/results_weak_train_lr_{eta}.json", 'w') as outfile: +with open(f"./plot/{folder_save}/results_weak_train_lr_{eta}.json", 'w') as outfile: #json.dump(results, outfile, indent = 4) str_j = json.dumps(results_weak_train, indent = 4, sort_keys = True, default=convert) outfile.write(str_j) @@ -247,10 +252,10 @@ def convert(x): fig.update_yaxes(range = [0,1], gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') fig.update_xaxes(range = [0,1], gridcolor = c_grid, constrain = 'domain', linecolor = 'black') -fig.write_image(net_path + f"plot/{folder_save}/plotly_ROC_bg_eff.jpg") -fig.write_image(net_path + f"plot/{folder_save}/plotly_ROC_bg_eff.pdf") +fig.write_image(f"./plot/{folder_save}/ROI_simple/plotly_ROC_bg_eff.jpg") +fig.write_image(f"./plot/{folder_save}/ROI_simple/plotly_ROC_bg_eff.pdf") + -''' fig = go.Figure([go.Scatter(x = 1 - fpr_mean, y = tpr_upper, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), go.Scatter(x = 1 - fpr_mean, y = tpr_lower, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), go.Scatter(x = 1 - fpr_mean, y = tpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.3f}')]) @@ -260,20 +265,22 @@ def convert(x): fig.update_yaxes(range = [0,1], gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') fig.update_xaxes(range = [0,1], gridcolor = c_grid, constrain = 'domain', linecolor = 'black') -fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_rej.jpg") -fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_rej.pdf") -''' +fig.write_image(f"./plot/{folder_save}/ROI_simple/plotly_ROC_bg_rej.jpg") +fig.write_image(f"./plot/{folder_save}/ROI_simple/plotly_ROC_bg_rej.pdf") + ################################################################################################## ########## Actual hyperparameter tuning ########################################################## ################################################################################################## trials = Trials() -#best_hyperparams = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 100, trials = trials) -#print("The best hyperparameters are: ", "\n") -#print(best_hyperparams) - +best_hyperparams = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 100, trials = trials) +print("The best hyperparameters are: ", "\n") +print(best_hyperparams) +################################################################################################## +################################################################################################## +################################################################################################## @@ -291,8 +298,8 @@ def convert(x): from sklearn.metrics import accuracy_score 
### Init classifier -#xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = best_hyperparams['learning_rate'], gamma = best_hyperparams['gamma'], reg_alpha = best_hyperparams['reg_alpha'], reg_lambda = best_hyperparams['reg_lambda'], n_estimators = 200, max_depth = int(best_hyperparams['max_depth']), subsample = best_hyperparams['subsample'], min_child_weight = best_hyperparams['min_child_weight'], colsample_bytree = best_hyperparams['colsample_bytree']) -xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = 0.0292, gamma = 1.087, reg_alpha = 42.0, reg_lambda = 0.381, n_estimators = 200, max_depth = 8, subsample = 0.841, min_child_weight = 2.0, colsample_bytree = 0.994) +xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = best_hyperparams['learning_rate'], gamma = best_hyperparams['gamma'], reg_alpha = best_hyperparams['reg_alpha'], reg_lambda = best_hyperparams['reg_lambda'], n_estimators = 200, max_depth = int(best_hyperparams['max_depth']), subsample = best_hyperparams['subsample'], min_child_weight = best_hyperparams['min_child_weight'], colsample_bytree = best_hyperparams['colsample_bytree']) +#xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = 0.0292, gamma = 1.087, reg_alpha = 42.0, reg_lambda = 0.381, n_estimators = 200, max_depth = 8, subsample = 0.841, min_child_weight = 2.0, colsample_bytree = 0.994) ### Fit @@ -349,7 +356,7 @@ def convert(x): plt.title('Importance plot') plt.legend(['']) #plt.show() -plt.savefig(net_path + f"plot/{folder_save}/importance.jpg") +plt.savefig(f"./plot/{folder_save}/importance.jpg") feature_importance = model.get_score(importance_type = 'weight') keys = list(feature_importance.keys()) @@ -372,13 +379,13 @@ def convert(x): ax2.set_ylabel("Feature names") ax2.set_title('Importance plot') #plt.show() -plt.savefig(f"plot/{folder_save}/importance_train.jpg") +plt.savefig(f"./plot/{folder_save}/ROI_simple/importance_train.jpg") plt.figure(figsize=(17,12)) plot_tree(xgb_cl, fmap = 'feature_map.txt') plt.title('Decision tree graph') #plt.show() -plt.savefig(net_path + f"plot/{folder_save}/boost_tree.jpg", dpi = 1800) +plt.savefig(f"./plot/{folder_save}/ROI_simple/boost_tree.jpg", dpi = 1800) ###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 #plt.show()''' @@ -386,7 +393,7 @@ def convert(x): plot_tree(model_xgb, fmap = 'feature_map.txt') plt.title('Decision tree graph') #plt.show() -plt.savefig(net_path + f"plot/{folder_save}/boost_tree_train.jpg", dpi = 1800) +plt.savefig(f"./plot/{folder_save}/ROI_simple/boost_tree_train.jpg", dpi = 1800) ###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 #plt.show()''' ''' diff --git a/xgb_test_only_xgb_no_coffea_diff_bgs.py b/xgb_test_only_xgb_no_coffea_diff_bgs.py new file mode 100644 index 0000000..db6adf0 --- /dev/null +++ b/xgb_test_only_xgb_no_coffea_diff_bgs.py @@ -0,0 +1,416 @@ +from coffea.util import load +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt, mplhep as hep +import hist +import argparse, sys, os, arrow, glob, yaml +from matplotlib.offsetbox import AnchoredText +import xgboost as xgb +from hyperopt import STATUS_OK, Trials, fmin, hp, tpe +from sklearn.metrics import accuracy_score +from tqdm.notebook import tqdm +from sklearn.metrics import roc_auc_score, roc_curve +from sklearn.model_selection import RepeatedKFold +import json + +####################################################################################### +## Create the folder to save the data if it doesn't exist and read in the dataframe ### 
+####################################################################################### +net_path = "/net/scratch_cms3a/vaulin/" +folder_save = 'eval_23_08_08' +roi = 'low_mumu' +if not os.path.exists(f"./plot/{folder_save}"): + os.mkdir(f"./plot/{folder_save}") +if not os.path.exists(f"./plot/{folder_save}/ROI_simple"): + os.mkdir(f"./plot/{folder_save}/ROI_simple") + +if not os.path.exists(net_path + f"plot/{folder_save}"): + os.mkdir(net_path + f"plot/{folder_save}") +df = pd.read_csv(f'./plot/{folder_save}/xgb_training_dataset_{roi}.csv') + + +bgs = ['DY', "ZZ", "WZ", "tt", "ZHtobb"] +bg_choice = 2 +bg_choice_2 = 0 + +eta = 0.03 +#eta = 0.03, 0.12, 0.3, 0.45, 0.8 + +df = df[(df.target_bg == 0)|(df.target_bg == bg_choice+1)|(df.target_bg == bg_choice_2+1)] + +time = arrow.now().format("YY_MM_DD") +plt.style.use(hep.style.ROOT) + + +######################################################################################## +########## drop target from df and bring it to a separate column, drop weights ######### +######################################################################################## +X = df.drop("target", axis = 1) +X = X.drop("target_bg", axis = 1) +print(X) +X = X.drop(f"wei_{roi}", axis = 1) +X = X.drop(f"Z_mass_{roi}", axis = 1) +X = X.drop(f"Z_pt_gen_{roi}", axis = 1) +X = X.drop(f"Z_mass_gen_{roi}", axis = 1) +print(X) +print(X.info()) + +y = df["target"] +print(y) + + +######################################################################################## +################# GRID search attempt ################################################## +######################################################################################## +''' +from sklearn.model_selection import GridSearchCV + +### Creat the parameter grid +gbm_param_grid = {'max_depth' : [3, 4, 5, 6, 7, 8, 9], 'min_child_weight' : [1], 'gamma' : [0], 'subsample' : [0.8], 'colsample_bytree' : [0.8], 'reg_alpha' : [0.005], 'n_estimators': [1000]} + +gbm = xgb.XGBRegressor() + +grid_mse = GridSearchCV(param_grid = gbm_param_grid, estimator = gbm, scoring = 'neg_mean_squared_error', cv = 4, verbose = 1) + +grid_mse.fit(X,y) + + +print("Best parameters found: ", grid_mse.best_params_) +print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_))) +''' + +######################################################################################## +############# An attempt to do hyperparameter tuning for the classifier fit ############ +######################################################################################## +space = {"max_depth": hp.quniform("max_depth", 3, 18, 1), + "gamma": hp.uniform("gamma", 1, 9), + "reg_alpha": hp.quniform("reg_alpha", 40, 180, 1), + "reg_lambda": hp.uniform("reg_lambda", 0, 1), + "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1), + "min_child_weight": hp.quniform("min_child_weight", 0, 10, 1), + "n_estimators": 200, + "learning_rate": hp.uniform("learning_rate", 0.001, 0.1), + "subsample": hp.uniform("subsample", 0.8, 1), + "seed":0} + +#learning_rate = space['learning_rate'], + +def objective(space): + clf = xgb.XGBClassifier( n_estimators = int(space['n_estimators']), max_depth = int(space['max_depth']), gamma = space['gamma'], reg_alpha = int(space['reg_alpha']), min_child_weight = int(space['min_child_weight']), colsample_bytree = int(space['colsample_bytree']), eval_metric = 'auc', early_stopping_rounds = 10) + evaluation = [(X_train, y_train), (X_test, y_test)] + + clf.fit(X_train, y_train, eval_set = evaluation, verbose = False) + pred = 
clf.predict(X_test) + accuracy = accuracy_score(y_test, pred>0.5) + print("SCORE: ", accuracy) + return {'loss': -accuracy, 'status': STATUS_OK} + + +######################################################################################### +############# Create pipelines for xgb training ######################################### +######################################################################################### +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder + +categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + +from sklearn.preprocessing import StandardScaler +numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + +cat_cols = X.select_dtypes(exclude = "number").columns +num_cols = X.select_dtypes(include = "number").columns + +print(cat_cols) +print(num_cols) + +from sklearn.compose import ColumnTransformer + +full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + + + +X_processed = full_processor.fit_transform(X) +y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + +######################################################################################### +############ split dataset into training and test ####################################### +######################################################################################### +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state = 1121218) +#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 101, stratify = y) +print(X_train) +print(X_test) +print(y_train) + +############################################################################################################ +######### preparing the XGB classifiers in 20 x 5-folds cross validation using repeated k-fold ############# +############################################################################################################ +cv = RepeatedKFold(n_splits = 8, n_repeats = 20, random_state = 101) +folds = [(train, test) for train, test in cv.split(X_train, y_train)] +#print(folds) +metrics = ['auc', 'fpr', 'tpr', 'thresholds'] +results = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +results_zero_train = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +results_weak_train = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + +params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': eta} +with open(net_path + f"plot/{folder_save}/results_first_{eta}.json", 'w') as outfile: + json.dump(results, outfile) + + + +dtest = xgb.DMatrix(X_test, label = y_test) +#print(dtest) +for train, test in tqdm(folds, total = len(folds)): + print('train') + dtrain = xgb.DMatrix(X_train[train,:], + label = y_train[train]) + dval = xgb.DMatrix(X_train[test, :], label = y_train[test]) + model = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 10, num_boost_round = 200) 
#num_boost_round = 1000, 200 is optimal + model_zero_train = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 10, num_boost_round = 0) #num_boost_round = 1000, 200 is optimal + model_weak_train = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 10, num_boost_round = 20) #num_boost_round = 1000, 200 is optimal + sets = [dtrain, dval, dtest] + for i, ds in enumerate(results.keys()): + print(i) + y_preds = model.predict(sets[i]) + y_preds_zero_train = model_zero_train.predict(sets[i]) + y_preds_weak_train = model_weak_train.predict(sets[i]) + labels = sets[i].get_label() + fpr, tpr, thresholds = roc_curve(labels, y_preds) + fpr_zero, tpr_zero, thresholds_zero = roc_curve(labels, y_preds_zero_train) + fpr_weak, tpr_weak, thresholds_weak = roc_curve(labels, y_preds_weak_train) + results[ds]['fpr'].append(fpr) + results[ds]['tpr'].append(tpr) + results[ds]['thresholds'].append(thresholds) + results[ds]['auc'].append(roc_auc_score(labels, y_preds)) + results_zero_train[ds]['fpr'].append(fpr_zero) + results_zero_train[ds]['tpr'].append(tpr_zero) + results_zero_train[ds]['thresholds'].append(thresholds_zero) + results_zero_train[ds]['auc'].append(roc_auc_score(labels, y_preds_zero_train)) + results_weak_train[ds]['fpr'].append(fpr_weak) + results_weak_train[ds]['tpr'].append(tpr_weak) + results_weak_train[ds]['thresholds'].append(thresholds_weak) + results_weak_train[ds]['auc'].append(roc_auc_score(labels, y_preds_weak_train)) + +def convert(x): + if hasattr(x, "tolist"): + return x.tolist() + raise TypeError(x) + +with open(f"./plot/{folder_save}/results_lr_{eta}_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}.json", 'w') as outfile: + #json.dump(results, outfile, indent = 4) + str_j = json.dumps(results, indent = 4, sort_keys = True, default=convert) + outfile.write(str_j) + +with open(f"./plot/{folder_save}/results_zero_train_lr_{eta}_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}.json", 'w') as outfile: + #json.dump(results, outfile, indent = 4) + str_j = json.dumps(results_zero_train, indent = 4, sort_keys = True, default=convert) + outfile.write(str_j) + +with open(f"./plot/{folder_save}/results_weak_train_lr_{eta}_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}.json", 'w') as outfile: + #json.dump(results, outfile, indent = 4) + str_j = json.dumps(results_weak_train, indent = 4, sort_keys = True, default=convert) + outfile.write(str_j) + +########################################################################################################## +############## plotting the ROC curves with uncertainties ################################################ +########################################################################################################## +kind = 'val' + +c_fill = 'rgba(52, 152, 219, 0.2)' +c_line = 'rgba(52, 152, 219, 0.5)' +c_line_main = 'rgba(41, 128, 185, 1.0)' +c_grid = 'rgba(189, 195, 199, 0.5)' +c_annot = 'rgba(149, 165, 166, 0.5)' +c_highlight = 'rgba(192, 57, 43, 1.0)' + +fpr_mean = np.linspace(0, 1, 100) + +interp_tprs = [] +for i in range(100): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) +tpr_mean = np.mean(interp_tprs, axis = 0) +tpr_mean[-1] = 1.0 +tpr_std = 2*np.std(interp_tprs, axis = 0) +tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) +tpr_lower = tpr_mean - tpr_std +auc = np.mean(results[kind]['auc']) + 
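+# Caveat on the interpolation loop above: RepeatedKFold(n_splits = 8,
+# n_repeats = 20) yields 160 folds, so results[kind]['fpr'] holds 160 curves,
+# while range(100) (apparently a leftover from an earlier 5 x 20 configuration)
+# averages only the first 100 of them. Deriving the count from the stored
+# results keeps the band consistent with the CV setup; a minimal sketch:
+#
+#   n_folds = len(results[kind]['fpr'])   # 160 for 8 splits x 20 repeats
+#   interp_tprs = [np.interp(fpr_mean, results[kind]['fpr'][i],
+#                            results[kind]['tpr'][i]) for i in range(n_folds)]
+#   # (pin interp_tprs[i][0] = 0.0 as above before averaging)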
+import plotly.graph_objects as go + +fig = go.Figure([go.Scatter(x = tpr_upper, y = fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = tpr_mean, y = fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.3f}')]) + +fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 0, y1 = 1) +fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'FPR (Background rejection)', yaxis_title = 'TPR (signal efficiency)', width = 1600, height = 900, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) +fig.update_yaxes(range = [0,1], gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') +fig.update_xaxes(range = [0,1], gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + +fig.write_image(f"./plot/{folder_save}/ROI_simple/plotly_ROC_bg_eff_{eta}.jpg") +fig.write_image(f"./plot/{folder_save}/ROI_simple/plotly_ROC_bg_eff_{eta}.pdf") + + +fig = go.Figure([go.Scatter(x = 1 - fpr_mean, y = tpr_upper, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = 1 - fpr_mean, y = tpr_lower, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = 1 - fpr_mean, y = tpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.3f}')]) + +fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) +fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = '1 - FPR (Background rejection)', yaxis_title = 'TPR (signal efficiency)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) +fig.update_yaxes(range = [0,1], gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') +fig.update_xaxes(range = [0,1], gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + +fig.write_image(f"./plot/{folder_save}/ROI_simple/plotly_ROC_bg_rej_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}_{eta}.jpg") +fig.write_image(f"./plot/{folder_save}/ROI_simple/plotly_ROC_bg_rej_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}_{eta}.pdf") + +################################################################################################## +########## Actual hyperparameter tuning ########################################################## +################################################################################################## + +trials = Trials() + +best_hyperparams = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 100, trials = trials) +print("The best hyperparameters are: ", "\n") +print(best_hyperparams) + +################################################################################################## +################################################################################################## +################################################################################################## + + + + + + + + + + + + + + +from sklearn.metrics import accuracy_score + +### Init classifier +xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = best_hyperparams['learning_rate'], gamma = 
best_hyperparams['gamma'], reg_alpha = best_hyperparams['reg_alpha'], reg_lambda = best_hyperparams['reg_lambda'], n_estimators = 200, max_depth = int(best_hyperparams['max_depth']), subsample = best_hyperparams['subsample'], min_child_weight = best_hyperparams['min_child_weight'], colsample_bytree = best_hyperparams['colsample_bytree']) +#xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = 0.0292, gamma = 1.087, reg_alpha = 42.0, reg_lambda = 0.381, n_estimators = 200, max_depth = 8, subsample = 0.841, min_child_weight = 2.0, colsample_bytree = 0.994) + + +### Fit +dtest = xgb.DMatrix(X_test, label = y_test) +#print(dtest) +dtrain = xgb.DMatrix(X_train[:int(len(X_train)*0.8), :], label = y_train[:int(len(y_train)*0.8)]) +dval = xgb.DMatrix(X_train[int(len(X_train)*0.8):, :], label = y_train[int(len(y_train)*0.8):]) +model_xgb = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 200) #num_boost_round = 1000, +sets = [dtrain, dval, dtest] +results_new = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +params_new = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss'} + +for i, ds in enumerate(results_new.keys()): + print(i) + y_preds_new = model_xgb.predict(sets[i]) + labels_new = sets[i].get_label() + fpr_new, tpr_new, thresholds_new = roc_curve(labels_new, y_preds_new) + results_new[ds]['fpr'].append(fpr_new) + results_new[ds]['tpr'].append(tpr_new) + results_new[ds]['thresholds'].append(thresholds_new) + results_new[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new)) + +xgb_cl.fit(X_train, y_train) + +print(xgb_cl) +### Predict +preds = xgb_cl.predict(X_test) + +print(accuracy_score(y_test, preds)) + +print(y_test) +print(model_xgb.predict(dtest)) +print(np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)])) +predict_train = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)]) + +print(accuracy_score(y_test, predict_train)) + +from xgboost import plot_importance +from xgboost import plot_tree, to_graphviz + +importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_}) +importances = importances.sort_values(by = "Importance", ascending = False) +importances = importances.set_index('Feature') +print(importances) +importances.plot.bar() + +fig, ax = plt.subplots(figsize=(17,12)) +plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax) +plt.xlabel('Feature scores') +plt.ylabel("Feature names") +plt.title('Importance plot') +plt.legend(['']) +#plt.show() +plt.savefig(f"./plot/{folder_save}/importance_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}_{eta}.jpg") + +feature_importance = model.get_score(importance_type = 'weight') +keys = list(feature_importance.keys()) +names_sig = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] +values = list(feature_importance.values()) +data = pd.DataFrame(data = values, index = names_sig, columns = ['score']).sort_values(by = 'score', ascending = False) +print(data) +print(data.index) + + +fig = plt.figure(figsize=(17,12)) +ax1 = fig.add_subplot(1,2,1) +ax1.set_axis_off() +ax2 = fig.add_subplot(1,2,2) +ax2.barh(list(reversed(data.index)), 
list(reversed(data.score))) +ax2.set_xlabel('Feature scores') +ax2.set_ylabel("Feature names") +ax2.set_title('Importance plot') +#plt.show() +plt.savefig(f"./plot/{folder_save}/ROI_simple/importance_train_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}_{eta}.jpg") + +plt.figure(figsize=(17,12)) +plot_tree(xgb_cl, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"./plot/{folder_save}/ROI_simple/boost_tree_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}_{eta}.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' + +plt.figure(figsize=(17,12)) +plot_tree(model_xgb, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"./plot/{folder_save}/ROI_simple/boost_tree_train_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}_{eta}.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' +''' +plt.figure(figsize=(17,12)) +to_graphviz(model_xgb, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"plot/{folder_save}/boost_tree_train_graphviz.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' diff --git a/xgb_test_only_xgb_no_coffea_diff_bgs_3bgs.py b/xgb_test_only_xgb_no_coffea_diff_bgs_3bgs.py new file mode 100644 index 0000000..9f09fd5 --- /dev/null +++ b/xgb_test_only_xgb_no_coffea_diff_bgs_3bgs.py @@ -0,0 +1,417 @@ +from coffea.util import load +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt, mplhep as hep +import hist +import argparse, sys, os, arrow, glob, yaml +from matplotlib.offsetbox import AnchoredText +import xgboost as xgb +from hyperopt import STATUS_OK, Trials, fmin, hp, tpe +from sklearn.metrics import accuracy_score +from tqdm.notebook import tqdm +from sklearn.metrics import roc_auc_score, roc_curve +from sklearn.model_selection import RepeatedKFold +import json + +####################################################################################### +## Create the folder to save the data if it doesn't exist and read in the dataframe ### +####################################################################################### +net_path = "/net/scratch_cms3a/vaulin/" +folder_save = 'eval_23_08_08' +roi = 'low_mumu' +if not os.path.exists(f"./plot/{folder_save}"): + os.mkdir(f"./plot/{folder_save}") +if not os.path.exists(f"./plot/{folder_save}/ROI_simple"): + os.mkdir(f"./plot/{folder_save}/ROI_simple") + +if not os.path.exists(net_path + f"plot/{folder_save}"): + os.mkdir(net_path + f"plot/{folder_save}") +df = pd.read_csv(f'./plot/{folder_save}/xgb_training_dataset_{roi}.csv') + + +bgs = ['DY', "ZZ", "WZ", "tt", "ZHtobb"] +bg_choice = 2 +bg_choice_2 = 0 +bg_choice_3 = 1 + +eta = 0.03 +#eta = 0.03, 0.12, 0.3, 0.45, 0.8 + +df = df[(df.target_bg == 0)|(df.target_bg == bg_choice+1)|(df.target_bg == bg_choice_2+1)|(df.target_bg == bg_choice_3+1)] + +time = arrow.now().format("YY_MM_DD") +plt.style.use(hep.style.ROOT) + + +######################################################################################## +########## drop target from df and bring it to a separate column, drop weights ######### +######################################################################################## +X = df.drop("target", axis = 1) +X = X.drop("target_bg", axis = 1) +print(X) +X = X.drop(f"wei_{roi}", axis = 1) +X = X.drop(f"Z_mass_{roi}", axis = 1) +X = X.drop(f"Z_pt_gen_{roi}", axis = 1) +X = X.drop(f"Z_mass_gen_{roi}", axis = 1) +print(X) +print(X.info()) + +y = 
df["target"]
+print(y)
+
+
+########################################################################################
+################# GRID search attempt ##################################################
+########################################################################################
+'''
+from sklearn.model_selection import GridSearchCV
+
+### Create the parameter grid
+gbm_param_grid = {'max_depth' : [3, 4, 5, 6, 7, 8, 9], 'min_child_weight' : [1], 'gamma' : [0], 'subsample' : [0.8], 'colsample_bytree' : [0.8], 'reg_alpha' : [0.005], 'n_estimators': [1000]}
+
+gbm = xgb.XGBRegressor()
+
+grid_mse = GridSearchCV(param_grid = gbm_param_grid, estimator = gbm, scoring = 'neg_mean_squared_error', cv = 4, verbose = 1)
+
+grid_mse.fit(X,y)
+
+
+print("Best parameters found: ", grid_mse.best_params_)
+print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_)))
+'''
+
+########################################################################################
+############# An attempt to do hyperparameter tuning for the classifier fit ############
+########################################################################################
+space = {"max_depth": hp.quniform("max_depth", 3, 18, 1),
+        "gamma": hp.uniform("gamma", 1, 9),
+        "reg_alpha": hp.quniform("reg_alpha", 40, 180, 1),
+        "reg_lambda": hp.uniform("reg_lambda", 0, 1),
+        "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1),
+        "min_child_weight": hp.quniform("min_child_weight", 0, 10, 1),
+        "n_estimators": 200,
+        "learning_rate": hp.uniform("learning_rate", 0.001, 0.1),
+        "subsample": hp.uniform("subsample", 0.8, 1),
+        "seed":0}
+
+#learning_rate = space['learning_rate'],
+
+def objective(space):
+    # keep colsample_bytree as a float: int() would floor the sampled 0.5-1 range to 0
+    clf = xgb.XGBClassifier( n_estimators = int(space['n_estimators']), max_depth = int(space['max_depth']), gamma = space['gamma'], reg_alpha = int(space['reg_alpha']), min_child_weight = int(space['min_child_weight']), colsample_bytree = float(space['colsample_bytree']), eval_metric = 'auc', early_stopping_rounds = 10)
+    evaluation = [(X_train, y_train), (X_test, y_test)]
+
+    clf.fit(X_train, y_train, eval_set = evaluation, verbose = False)
+    pred = clf.predict(X_test)
+    accuracy = accuracy_score(y_test, pred>0.5)
+    print("SCORE: ", accuracy)
+    return {'loss': -accuracy, 'status': STATUS_OK}
+
+
+#########################################################################################
+############# Create pipelines for xgb training #########################################
+#########################################################################################
+from sklearn.impute import SimpleImputer
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import OneHotEncoder
+
+categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),])
+
+from sklearn.preprocessing import StandardScaler
+numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())])
+
+cat_cols = X.select_dtypes(exclude = "number").columns
+num_cols = X.select_dtypes(include = "number").columns
+
+print(cat_cols)
+print(num_cols)
+
+from sklearn.compose import ColumnTransformer
+
+full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),])
+
+
+
+X_processed = full_processor.fit_transform(X)
+y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1))
+
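A side note on the search space above: hp.quniform proposes quantized floats rather than true integers, which is why objective() casts max_depth, reg_alpha and min_child_weight back to int. A minimal sketch to eyeball what fmin() will actually draw, assuming only that hyperopt is installed and that `space` is the dict defined above:

from hyperopt.pyll.stochastic import sample

# draw one random configuration from the search space;
# repeat a few times to see the ranges the optimizer will explore
print(sample(space))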
+######################################################################################### +############ split dataset into training and test ####################################### +######################################################################################### +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state = 1121218) +#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 101, stratify = y) +print(X_train) +print(X_test) +print(y_train) + +############################################################################################################ +######### preparing the XGB classifiers in 20 x 5-folds cross validation using repeated k-fold ############# +############################################################################################################ +cv = RepeatedKFold(n_splits = 8, n_repeats = 20, random_state = 101) +folds = [(train, test) for train, test in cv.split(X_train, y_train)] +#print(folds) +metrics = ['auc', 'fpr', 'tpr', 'thresholds'] +results = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +results_zero_train = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +results_weak_train = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + +params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': eta} +with open(net_path + f"plot/{folder_save}/results_first_{eta}.json", 'w') as outfile: + json.dump(results, outfile) + + + +dtest = xgb.DMatrix(X_test, label = y_test) +#print(dtest) +for train, test in tqdm(folds, total = len(folds)): + print('train') + dtrain = xgb.DMatrix(X_train[train,:], + label = y_train[train]) + dval = xgb.DMatrix(X_train[test, :], label = y_train[test]) + model = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 10, num_boost_round = 200) #num_boost_round = 1000, 200 is optimal + model_zero_train = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 10, num_boost_round = 0) #num_boost_round = 1000, 200 is optimal + model_weak_train = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 10, num_boost_round = 20) #num_boost_round = 1000, 200 is optimal + sets = [dtrain, dval, dtest] + for i, ds in enumerate(results.keys()): + print(i) + y_preds = model.predict(sets[i]) + y_preds_zero_train = model_zero_train.predict(sets[i]) + y_preds_weak_train = model_weak_train.predict(sets[i]) + labels = sets[i].get_label() + fpr, tpr, thresholds = roc_curve(labels, y_preds) + fpr_zero, tpr_zero, thresholds_zero = roc_curve(labels, y_preds_zero_train) + fpr_weak, tpr_weak, thresholds_weak = roc_curve(labels, y_preds_weak_train) + results[ds]['fpr'].append(fpr) + results[ds]['tpr'].append(tpr) + results[ds]['thresholds'].append(thresholds) + results[ds]['auc'].append(roc_auc_score(labels, y_preds)) + results_zero_train[ds]['fpr'].append(fpr_zero) + results_zero_train[ds]['tpr'].append(tpr_zero) + results_zero_train[ds]['thresholds'].append(thresholds_zero) + results_zero_train[ds]['auc'].append(roc_auc_score(labels, y_preds_zero_train)) + 
results_weak_train[ds]['fpr'].append(fpr_weak) + results_weak_train[ds]['tpr'].append(tpr_weak) + results_weak_train[ds]['thresholds'].append(thresholds_weak) + results_weak_train[ds]['auc'].append(roc_auc_score(labels, y_preds_weak_train)) + +def convert(x): + if hasattr(x, "tolist"): + return x.tolist() + raise TypeError(x) + +with open(f"./plot/{folder_save}/results_lr_{eta}_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}_{bgs[bg_choice_3]}.json", 'w') as outfile: + #json.dump(results, outfile, indent = 4) + str_j = json.dumps(results, indent = 4, sort_keys = True, default=convert) + outfile.write(str_j) + +with open(f"./plot/{folder_save}/results_zero_train_lr_{eta}_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}_{bgs[bg_choice_3]}.json", 'w') as outfile: + #json.dump(results, outfile, indent = 4) + str_j = json.dumps(results_zero_train, indent = 4, sort_keys = True, default=convert) + outfile.write(str_j) + +with open(f"./plot/{folder_save}/results_weak_train_lr_{eta}_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}_{bgs[bg_choice_3]}.json", 'w') as outfile: + #json.dump(results, outfile, indent = 4) + str_j = json.dumps(results_weak_train, indent = 4, sort_keys = True, default=convert) + outfile.write(str_j) + +########################################################################################################## +############## plotting the ROC curves with uncertainties ################################################ +########################################################################################################## +kind = 'val' + +c_fill = 'rgba(52, 152, 219, 0.2)' +c_line = 'rgba(52, 152, 219, 0.5)' +c_line_main = 'rgba(41, 128, 185, 1.0)' +c_grid = 'rgba(189, 195, 199, 0.5)' +c_annot = 'rgba(149, 165, 166, 0.5)' +c_highlight = 'rgba(192, 57, 43, 1.0)' + +fpr_mean = np.linspace(0, 1, 100) + +interp_tprs = [] +for i in range(100): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) +tpr_mean = np.mean(interp_tprs, axis = 0) +tpr_mean[-1] = 1.0 +tpr_std = 2*np.std(interp_tprs, axis = 0) +tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) +tpr_lower = tpr_mean - tpr_std +auc = np.mean(results[kind]['auc']) + +import plotly.graph_objects as go + +fig = go.Figure([go.Scatter(x = tpr_upper, y = fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = tpr_mean, y = fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.3f}')]) + +fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 0, y1 = 1) +fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'FPR (Background rejection)', yaxis_title = 'TPR (signal efficiency)', width = 1600, height = 900, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) +fig.update_yaxes(range = [0,1], gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') +fig.update_xaxes(range = [0,1], gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + +fig.write_image(f"./plot/{folder_save}/ROI_simple/plotly_ROC_bg_eff_{eta}.jpg") +fig.write_image(f"./plot/{folder_save}/ROI_simple/plotly_ROC_bg_eff_{eta}.pdf") + + +fig = go.Figure([go.Scatter(x = 1 - fpr_mean, y = tpr_upper, line = 
dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = 1 - fpr_mean, y = tpr_lower, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = 1 - fpr_mean, y = tpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.3f}')]) + +fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) +fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = '1 - FPR (Background rejection)', yaxis_title = 'TPR (signal efficiency)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) +fig.update_yaxes(range = [0,1], gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') +fig.update_xaxes(range = [0,1], gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + +fig.write_image(f"./plot/{folder_save}/ROI_simple/plotly_ROC_bg_rej_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}_{bgs[bg_choice_3]}_{eta}.jpg") +fig.write_image(f"./plot/{folder_save}/ROI_simple/plotly_ROC_bg_rej_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}_{bgs[bg_choice_3]}_{eta}.pdf") + +################################################################################################## +########## Actual hyperparameter tuning ########################################################## +################################################################################################## + +trials = Trials() + +best_hyperparams = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 100, trials = trials) +print("The best hyperparameters are: ", "\n") +print(best_hyperparams) + +################################################################################################## +################################################################################################## +################################################################################################## + + + + + + + + + + + + + + +from sklearn.metrics import accuracy_score + +### Init classifier +xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = best_hyperparams['learning_rate'], gamma = best_hyperparams['gamma'], reg_alpha = best_hyperparams['reg_alpha'], reg_lambda = best_hyperparams['reg_lambda'], n_estimators = 200, max_depth = int(best_hyperparams['max_depth']), subsample = best_hyperparams['subsample'], min_child_weight = best_hyperparams['min_child_weight'], colsample_bytree = best_hyperparams['colsample_bytree']) +#xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = 0.0292, gamma = 1.087, reg_alpha = 42.0, reg_lambda = 0.381, n_estimators = 200, max_depth = 8, subsample = 0.841, min_child_weight = 2.0, colsample_bytree = 0.994) + + +### Fit +dtest = xgb.DMatrix(X_test, label = y_test) +#print(dtest) +dtrain = xgb.DMatrix(X_train[:int(len(X_train)*0.8), :], label = y_train[:int(len(y_train)*0.8)]) +dval = xgb.DMatrix(X_train[int(len(X_train)*0.8):, :], label = y_train[int(len(y_train)*0.8):]) +model_xgb = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 200) #num_boost_round = 1000, +sets = [dtrain, dval, dtest] +results_new = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +params_new = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss'} + +for i, ds in 
enumerate(results_new.keys()): + print(i) + y_preds_new = model_xgb.predict(sets[i]) + labels_new = sets[i].get_label() + fpr_new, tpr_new, thresholds_new = roc_curve(labels_new, y_preds_new) + results_new[ds]['fpr'].append(fpr_new) + results_new[ds]['tpr'].append(tpr_new) + results_new[ds]['thresholds'].append(thresholds_new) + results_new[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new)) + +xgb_cl.fit(X_train, y_train) + +print(xgb_cl) +### Predict +preds = xgb_cl.predict(X_test) + +print(accuracy_score(y_test, preds)) + +print(y_test) +print(model_xgb.predict(dtest)) +print(np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)])) +predict_train = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)]) + +print(accuracy_score(y_test, predict_train)) + +from xgboost import plot_importance +from xgboost import plot_tree, to_graphviz + +importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_}) +importances = importances.sort_values(by = "Importance", ascending = False) +importances = importances.set_index('Feature') +print(importances) +importances.plot.bar() + +fig, ax = plt.subplots(figsize=(17,12)) +plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax) +plt.xlabel('Feature scores') +plt.ylabel("Feature names") +plt.title('Importance plot') +plt.legend(['']) +#plt.show() +plt.savefig(f"./plot/{folder_save}/importance_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}_{bgs[bg_choice_3]}_{eta}.jpg") + +feature_importance = model.get_score(importance_type = 'weight') +keys = list(feature_importance.keys()) +names_sig = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] +values = list(feature_importance.values()) +data = pd.DataFrame(data = values, index = names_sig, columns = ['score']).sort_values(by = 'score', ascending = False) +print(data) +print(data.index) + + +fig = plt.figure(figsize=(17,12)) +ax1 = fig.add_subplot(1,2,1) +ax1.set_axis_off() +ax2 = fig.add_subplot(1,2,2) +ax2.barh(list(reversed(data.index)), list(reversed(data.score))) +ax2.set_xlabel('Feature scores') +ax2.set_ylabel("Feature names") +ax2.set_title('Importance plot') +#plt.show() +plt.savefig(f"./plot/{folder_save}/ROI_simple/importance_train_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}_{bgs[bg_choice_3]}_{eta}.jpg") + +plt.figure(figsize=(17,12)) +plot_tree(xgb_cl, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"./plot/{folder_save}/ROI_simple/boost_tree_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}_{bgs[bg_choice_3]}_{eta}.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' + +plt.figure(figsize=(17,12)) +plot_tree(model_xgb, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"./plot/{folder_save}/ROI_simple/boost_tree_train_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}_{bgs[bg_choice_3]}_{eta}.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' +''' +plt.figure(figsize=(17,12)) +to_graphviz(model_xgb, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"plot/{folder_save}/boost_tree_train_graphviz.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' diff --git 
a/xgb_test_only_xgb_no_coffea_diff_bgs_all etas.py b/xgb_test_only_xgb_no_coffea_diff_bgs_all etas.py
new file mode 100644
index 0000000..ab9f632
--- /dev/null
+++ b/xgb_test_only_xgb_no_coffea_diff_bgs_all etas.py
@@ -0,0 +1,530 @@
+
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt, mplhep as hep
+import hist
+import argparse, sys, os, arrow, glob, yaml
+from matplotlib.offsetbox import AnchoredText
+import xgboost as xgb
+from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
+from sklearn.metrics import accuracy_score
+from tqdm.notebook import tqdm
+from sklearn.metrics import roc_auc_score, roc_curve
+from sklearn.model_selection import RepeatedKFold
+import json
+from tkinter import filedialog as fd
+from tkinter import messagebox
+
+from tkinter import *
+
+window = Tk()
+window.title("XGBoost training")
+window.minsize(500,300)
+
+### Creating a label; pack() just puts it on screen
+label = Label(master = window, text = " Welcome to XGBoost training\n Choose the channel to start with:")
+label.pack(side = TOP, expand = True)
+def objective(space):
+    # keep colsample_bytree as a float: int() would floor the sampled 0.5-1 range to 0
+    clf = xgb.XGBClassifier( n_estimators = int(space['n_estimators']), max_depth = int(space['max_depth']), gamma = space['gamma'], reg_alpha = int(space['reg_alpha']), min_child_weight = int(space['min_child_weight']), colsample_bytree = float(space['colsample_bytree']), eval_metric = 'auc', early_stopping_rounds = 10)
+    evaluation = [(X_train, y_train), (X_test, y_test)]
+
+    clf.fit(X_train, y_train, eval_set = evaluation, verbose = False)
+    pred = clf.predict(X_test)
+    accuracy = accuracy_score(y_test, pred>0.5)
+    print("SCORE: ", accuracy)
+    return {'loss': -accuracy, 'status': STATUS_OK}
+
+def convert(x):
+    if hasattr(x, "tolist"):
+        return x.tolist()
+    raise TypeError(x)
+
+def main(back, eta, chan, file):
+    #######################################################################################
+    ## Create the folder to save the data if it doesn't exist and read in the dataframe ###
+    #######################################################################################
+    net_path = "/net/scratch_cms3a/vaulin/"
+    folder_save = 'eval_23_07_19'
+    roi = 'low_mumu'
+    if not os.path.exists(f"./plot/{folder_save}"):
+        os.mkdir(f"./plot/{folder_save}")
+    if not os.path.exists(f"./plot/{folder_save}/ROI_simple"):
+        os.mkdir(f"./plot/{folder_save}/ROI_simple")
+
+    df = pd.read_csv(file)
+
+
+    bgs = ['DY', "ZZ", "WZ", "tt", "ZHtobb"]
+    bg_choice = back
+
+    df = df[(df.target_bg == 0)|(df.target_bg == bg_choice+1)]
+
+    time = arrow.now().format("YY_MM_DD")
+    plt.style.use(hep.style.ROOT)
+
+
+    ########################################################################################
+    ########## drop target from df and bring it to a separate column, drop weights #########
+    ########################################################################################
+    X = df.drop("target", axis = 1)
+    X = X.drop("target_bg", axis = 1)
+    print(X)
+    X = X.drop(f"wei_{roi}", axis = 1)
+    X = X.drop(f"Z_mass_{roi}", axis = 1)
+    X = X.drop(f"Z_pt_gen_{roi}", axis = 1)
+    X = X.drop(f"Z_mass_gen_{roi}", axis = 1)
+    print(X)
+    print(X.info())
+
+    y = df["target"]
+    print(y)
+
+
+    ########################################################################################
+    ################# GRID search attempt ##################################################
+    ########################################################################################
+    '''
+    # =============================================================================
+    #
from sklearn.model_selection import GridSearchCV + # + # ### Creat the parameter grid + # gbm_param_grid = {'max_depth' : [3, 4, 5, 6, 7, 8, 9], 'min_child_weight' : [1], 'gamma' : [0], 'subsample' : [0.8], 'colsample_bytree' : [0.8], 'reg_alpha' : [0.005], 'n_estimators': [1000]} + # + # gbm = xgb.XGBRegressor() + # + # grid_mse = GridSearchCV(param_grid = gbm_param_grid, estimator = gbm, scoring = 'neg_mean_squared_error', cv = 4, verbose = 1) + # + # grid_mse.fit(X,y) + # + # + # print("Best parameters found: ", grid_mse.best_params_) + # print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_))) + # ============================================================================= + ''' + + ######################################################################################## + ############# An attempt to do hyperparameter tuning for the classifier fit ############ + ######################################################################################## + space = {"max_depth": hp.quniform("max_depth", 3, 18, 1), + "gamma": hp.uniform("gamma", 1, 9), + "reg_alpha": hp.quniform("reg_alpha", 40, 180, 1), + "reg_lambda": hp.uniform("reg_lambda", 0, 1), + "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1), + "min_child_weight": hp.quniform("min_child_weight", 0, 10, 1), + "n_estimators": 200, + "learning_rate": hp.uniform("learning_rate", 0.001, 0.1), + "subsample": hp.uniform("subsample", 0.8, 1), + "seed":0} + + #learning_rate = space['learning_rate'], + + + + + ######################################################################################### + ############# Create pipelines for xgb training ######################################### + ######################################################################################### + from sklearn.impute import SimpleImputer + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import OneHotEncoder + + categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + + from sklearn.preprocessing import StandardScaler + numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + + cat_cols = X.select_dtypes(exclude = "number").columns + num_cols = X.select_dtypes(include = "number").columns + + print(cat_cols) + print(num_cols) + + from sklearn.compose import ColumnTransformer + + full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + + + + X_processed = full_processor.fit_transform(X) + y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + + ######################################################################################### + ############ split dataset into training and test ####################################### + ######################################################################################### + from sklearn.model_selection import train_test_split + global X_train + global X_test + global y_train + global y_test + X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state = 1121218) + #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 101, stratify = y) + print(X_train) + print(X_test) + print(y_train) + + 
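One thing to keep in mind for the cross-validation block below: its banner speaks of "20 x 5-folds", but the code constructs RepeatedKFold(n_splits = 8, n_repeats = 20), i.e. 160 train/validation index pairs per run. A two-line sanity check (a standalone sketch using the same constructor arguments):

from sklearn.model_selection import RepeatedKFold

# 8 folds repeated 20 times -> 160 (train, test) splits
print(RepeatedKFold(n_splits = 8, n_repeats = 20, random_state = 101).get_n_splits())  # 160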
############################################################################################################ + ######### preparing the XGB classifiers in 20 x 5-folds cross validation using repeated k-fold ############# + ############################################################################################################ + cv = RepeatedKFold(n_splits = 8, n_repeats = 20, random_state = 101) + folds = [(train, test) for train, test in cv.split(X_train, y_train)] + #print(folds) + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + results = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + results_zero_train = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + results_weak_train = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + eta = eta + + params = {'objective': 'binary:logistic', + 'eval_metric': 'logloss', 'eta': eta} + with open(f"./plot/{folder_save}/results_first_{eta}.json", 'w') as outfile: + json.dump(results, outfile) + + dtest = xgb.DMatrix(X_test, label=y_test) + #print(dtest) + for train, test in tqdm(folds, total=len(folds)): + print('train') + dtrain = xgb.DMatrix(X_train[train, :], + label=y_train[train]) + dval = xgb.DMatrix(X_train[test, :], label=y_train[test]) + model = xgb.train(dtrain=dtrain, params=params, evals=[(dtrain, 'train'), (dval, 'dval')], + verbose_eval=1, early_stopping_rounds=10, num_boost_round=200) # num_boost_round = 1000, 200 is optimal + model_zero_train = xgb.train(dtrain=dtrain, params=params, evals=[(dtrain, 'train'), (dval, 'dval')], + verbose_eval=1, early_stopping_rounds=10, num_boost_round=0) # num_boost_round = 1000, 200 is optimal + model_weak_train = xgb.train(dtrain=dtrain, params=params, evals=[(dtrain, 'train'), (dval, 'dval')], + verbose_eval=1, early_stopping_rounds=10, num_boost_round=20) # num_boost_round = 1000, 200 is optimal + sets = [dtrain, dval, dtest] + for i, ds in enumerate(results.keys()): + print(i) + y_preds = model.predict(sets[i]) + y_preds_zero_train = model_zero_train.predict(sets[i]) + y_preds_weak_train = model_weak_train.predict(sets[i]) + labels = sets[i].get_label() + fpr, tpr, thresholds = roc_curve(labels, y_preds) + fpr_zero, tpr_zero, thresholds_zero = roc_curve( + labels, y_preds_zero_train) + fpr_weak, tpr_weak, thresholds_weak = roc_curve( + labels, y_preds_weak_train) + results[ds]['fpr'].append(fpr) + results[ds]['tpr'].append(tpr) + results[ds]['thresholds'].append(thresholds) + results[ds]['auc'].append(roc_auc_score(labels, y_preds)) + results_zero_train[ds]['fpr'].append(fpr_zero) + results_zero_train[ds]['tpr'].append(tpr_zero) + results_zero_train[ds]['thresholds'].append(thresholds_zero) + results_zero_train[ds]['auc'].append(roc_auc_score(labels, y_preds_zero_train)) + results_weak_train[ds]['fpr'].append(fpr_weak) + results_weak_train[ds]['tpr'].append(tpr_weak) + results_weak_train[ds]['thresholds'].append(thresholds_weak) + results_weak_train[ds]['auc'].append(roc_auc_score(labels, y_preds_weak_train)) + + + + with open(f"./plot/{folder_save}/results_lr_{eta}_bg_{bgs[bg_choice]}.json", 'w') as outfile: + #json.dump(results, outfile, indent = 4) + str_j = json.dumps(results, indent = 4, sort_keys = True, default=convert) + outfile.write(str_j) + + with open(f"./plot/{folder_save}/results_zero_train_lr_{eta}_bg_{bgs[bg_choice]}.json", 'w') as outfile: + #json.dump(results, outfile, indent = 4) + str_j = 
json.dumps(results_zero_train, indent = 4, sort_keys = True, default=convert) + outfile.write(str_j) + + with open(f"./plot/{folder_save}/results_weak_train_lr_{eta}_bg_{bgs[bg_choice]}.json", 'w') as outfile: + #json.dump(results, outfile, indent = 4) + str_j = json.dumps(results_weak_train, indent = 4, sort_keys = True, default=convert) + outfile.write(str_j) + + ########################################################################################################## + ############## plotting the ROC curves with uncertainties ################################################ + ########################################################################################################## + kind = 'val' + + c_fill = 'rgba(52, 152, 219, 0.2)' + c_line = 'rgba(52, 152, 219, 0.5)' + c_line_main = 'rgba(41, 128, 185, 1.0)' + c_grid = 'rgba(189, 195, 199, 0.5)' + c_annot = 'rgba(149, 165, 166, 0.5)' + c_highlight = 'rgba(192, 57, 43, 1.0)' + + fpr_mean = np.linspace(0, 1, 100) + + interp_tprs = [] + for i in range(100): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) + tpr_mean = np.mean(interp_tprs, axis = 0) + tpr_mean[-1] = 1.0 + tpr_std = 2*np.std(interp_tprs, axis = 0) + tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) + tpr_lower = tpr_mean - tpr_std + auc = np.mean(results[kind]['auc']) + + import plotly.graph_objects as go + + fig = go.Figure([go.Scatter(x = tpr_upper, y = fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = tpr_mean, y = fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.3f}')]) + + fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 0, y1 = 1) + fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'FPR (Background rejection)', yaxis_title = 'TPR (signal efficiency)', width = 1600, height = 900, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) + fig.update_yaxes(range = [0,1], gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') + fig.update_xaxes(range = [0,1], gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + + fig.write_image(f"./plot/{folder_save}/ROI_simple/plotly_ROC_bg_eff_{eta}.jpg") + fig.write_image(f"./plot/{folder_save}/ROI_simple/plotly_ROC_bg_eff_{eta}.pdf") + + + fig = go.Figure([go.Scatter(x = 1 - fpr_mean, y = tpr_upper, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = 1 - fpr_mean, y = tpr_lower, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = 1 - fpr_mean, y = tpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.3f}')]) + + fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) + fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = '1 - FPR (Background rejection)', yaxis_title = 'TPR (signal efficiency)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) + fig.update_yaxes(range = [0,1], gridcolor 
= c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') + fig.update_xaxes(range = [0,1], gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + + fig.write_image(f"./plot/{folder_save}/ROI_simple/plotly_ROC_bg_rej_bg_{bgs[bg_choice]}_{eta}.jpg") + fig.write_image(f"./plot/{folder_save}/ROI_simple/plotly_ROC_bg_rej_bg_{bgs[bg_choice]}_{eta}.pdf") + + ################################################################################################## + ########## Actual hyperparameter tuning ########################################################## + ################################################################################################## + + trials = Trials() + + best_hyperparams = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 100, trials = trials) + print("The best hyperparameters are: ", "\n") + print(best_hyperparams) + + ################################################################################################## + ################################################################################################## + ################################################################################################## + + + + + + + + + + + + + + + from sklearn.metrics import accuracy_score + + ### Init classifier + xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = best_hyperparams['learning_rate'], gamma = best_hyperparams['gamma'], reg_alpha = best_hyperparams['reg_alpha'], reg_lambda = best_hyperparams['reg_lambda'], n_estimators = 200, max_depth = int(best_hyperparams['max_depth']), subsample = best_hyperparams['subsample'], min_child_weight = best_hyperparams['min_child_weight'], colsample_bytree = best_hyperparams['colsample_bytree']) + #xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = 0.0292, gamma = 1.087, reg_alpha = 42.0, reg_lambda = 0.381, n_estimators = 200, max_depth = 8, subsample = 0.841, min_child_weight = 2.0, colsample_bytree = 0.994) + + + ### Fit + dtest = xgb.DMatrix(X_test, label = y_test) + #print(dtest) + dtrain = xgb.DMatrix(X_train[:int(len(X_train)*0.8), :], label = y_train[:int(len(y_train)*0.8)]) + dval = xgb.DMatrix(X_train[int(len(X_train)*0.8):, :], label = y_train[int(len(y_train)*0.8):]) + model_xgb = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 200) #num_boost_round = 1000, + sets = [dtrain, dval, dtest] + results_new = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + params_new = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss'} + + for i, ds in enumerate(results_new.keys()): + print(i) + y_preds_new = model_xgb.predict(sets[i]) + labels_new = sets[i].get_label() + fpr_new, tpr_new, thresholds_new = roc_curve(labels_new, y_preds_new) + results_new[ds]['fpr'].append(fpr_new) + results_new[ds]['tpr'].append(tpr_new) + results_new[ds]['thresholds'].append(thresholds_new) + results_new[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new)) + + xgb_cl.fit(X_train, y_train) + + print(xgb_cl) + ### Predict + preds = xgb_cl.predict(X_test) + + print(accuracy_score(y_test, preds)) + + print(y_test) + print(model_xgb.predict(dtest)) + print(np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)])) + predict_train = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)]) + + print(accuracy_score(y_test, predict_train)) + + from xgboost import plot_importance + 
from xgboost import plot_tree, to_graphviz + + importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_}) + importances = importances.sort_values(by = "Importance", ascending = False) + importances = importances.set_index('Feature') + print(importances) + importances.plot.bar() + + fig, ax = plt.subplots(figsize=(17,12)) + plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax) + plt.xlabel('Feature scores') + plt.ylabel("Feature names") + plt.title('Importance plot') + plt.legend(['']) + #plt.show() + plt.savefig(f"./plot/{folder_save}/importance_bg_{bgs[bg_choice]}_{eta}.jpg") + + feature_importance = model.get_score(importance_type = 'weight') + keys = list(feature_importance.keys()) + names_sig = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + values = list(feature_importance.values()) + data = pd.DataFrame(data = values, index = names_sig, columns = ['score']).sort_values(by = 'score', ascending = False) + print(data) + print(data.index) + + + fig = plt.figure(figsize=(17,12)) + ax1 = fig.add_subplot(1,2,1) + ax1.set_axis_off() + ax2 = fig.add_subplot(1,2,2) + ax2.barh(list(reversed(data.index)), list(reversed(data.score))) + ax2.set_xlabel('Feature scores') + ax2.set_ylabel("Feature names") + ax2.set_title('Importance plot') + #plt.show() + plt.savefig(f"./plot/{folder_save}/ROI_simple/importance_train_bg_{bgs[bg_choice]}_{eta}.jpg") + + plt.figure(figsize=(17,12)) + plot_tree(xgb_cl, fmap = 'feature_map.txt') + plt.title('Decision tree graph') + #plt.show() + plt.savefig(f"./plot/{folder_save}/ROI_simple/boost_tree_bg_{bgs[bg_choice]}_{eta}.jpg", dpi = 1800) + ###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 + #plt.show()''' + + #plt.figure(figsize=(17,12)) + #plot_tree(model_xgb, fmap = 'feature_map.txt') + #plt.title('Decision tree graph') + #plt.show() + #plt.savefig(f"./plot/{folder_save}/ROI_simple/boost_tree_train_bg_{bgs[bg_choice]}_{eta}.jpg", dpi = 1800) + ###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 + #plt.show()''' +''' + plt.figure(figsize=(17,12)) + to_graphviz(model_xgb, fmap = 'feature_map.txt') + plt.title('Decision tree graph') + #plt.show() + plt.savefig(f"plot/{folder_save}/boost_tree_train_graphviz.jpg", dpi = 1800) + ###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 + #plt.show()''' + + +###Checkbutton +def checkused(): + global checkbutton + ##Returns 1 if used, otherwise 0 + print(checked_state.get()) + label.config(text = "Enjoy the ride - it will take some time") + checkbutton.pack_forget() + main(back_to_use[0], eta_to_use[1], chan_to_use[1], file_to_use) + + + +checked_state = IntVar() +checkbutton = Checkbutton(text = "Ready to start?", variable = checked_state, command = checkused) + + + + +back_to_use = 0 +eta_to_use = 0 +chan_to_use = 0 +file_to_use = 0 + +###Listbox +def listboxused(event): + global back_to_use + global listbox_eta + #print(listbox.get(listbox.curselection())) + back_to_use = int(list(listbox.curselection())[0]), listbox.get(listbox.curselection()) + label.config(text = "Now choose the learning rate eta to work with") + listbox.pack_forget() + + + listbox_eta.pack() + +def listboxused_eta(event): + global eta_to_use + global checkbutton + print(listbox_eta.get(listbox_eta.curselection())) + eta_to_use = 
int(list(listbox_eta.curselection())[0]), listbox_eta.get(listbox_eta.curselection())
+    label.config(text = "You are ready to start")
+    listbox_eta.pack_forget()
+    messagebox.showinfo(title = "Training info",
+                        message = f"Channel: {chan_to_use[1]}\nFile: {file_to_use}\nBackground: {back_to_use[1]}\nLearning rate (eta): {eta_to_use[1]}")
+    checkbutton.pack()
+
+###Listbox
+def listboxused_chan(event):
+    global chan_to_use
+    global file_to_use
+    global listbox
+
+    print(listbox_chan.get(listbox_chan.curselection()))
+    chan_to_use = int(list(listbox_chan.curselection())[0]), listbox_chan.get(listbox_chan.curselection())
+    label.config(text = "Now choose the background to work with")
+    listbox_chan.pack_forget()
+    filetypes = (
+        ("text files", "*.txt"),
+        ("csv files", "*.csv"),
+        ("All files", "*.*"))
+    filename = fd.askopenfile(
+        title = "Open a file",
+        initialdir = "./plot/",
+        filetypes = filetypes)
+
+    messagebox.showinfo(title = "Selected file",
+                        message = filename.name)
+    file_to_use = filename.name
+
+    listbox.pack()
+
+
+listbox = Listbox(height = 5)
+names = ['DY', "ZZ", "WZ", "tt", "ZHtobb"]
+for bglist in names:
+    listbox.insert(names.index(bglist), bglist)
+listbox.bind("<<ListboxSelect>>", listboxused)  # Tk's virtual event for a selection change
+
+listbox_eta = Listbox(height = 5)
+etas = [0.03, 0.12, 0.3, 0.45, 0.8]
+for eta in etas:
+    listbox_eta.insert(etas.index(eta), eta)
+listbox_eta.bind("<<ListboxSelect>>", listboxused_eta)
+#listbox_eta.pack()
+
+listbox_chan = Listbox(height = 4)
+names_chan = ['low_mumu', 'low_ee', 'high_mumu', 'high_ee']
+for chan in names_chan:
+    listbox_chan.insert(names_chan.index(chan), chan)
+listbox_chan.bind("<<ListboxSelect>>", listboxused_chan)
+listbox_chan.pack()
+
+
+
+
+window.mainloop()
diff --git a/xgb_test_only_xgb_no_coffea_diff_bgs_full_bg_set.py b/xgb_test_only_xgb_no_coffea_diff_bgs_full_bg_set.py
new file mode 100644
index 0000000..f8d2e2b
--- /dev/null
+++ b/xgb_test_only_xgb_no_coffea_diff_bgs_full_bg_set.py
@@ -0,0 +1,418 @@
+from coffea.util import load
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt, mplhep as hep
+import hist
+import argparse, sys, os, arrow, glob, yaml
+from matplotlib.offsetbox import AnchoredText
+import xgboost as xgb
+from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
+from sklearn.metrics import accuracy_score
+from tqdm.notebook import tqdm
+from sklearn.metrics import roc_auc_score, roc_curve
+from sklearn.model_selection import RepeatedKFold
+import json
+
+#######################################################################################
+## Create the folder to save the data if it doesn't exist and read in the dataframe ###
+#######################################################################################
+net_path = "/net/scratch_cms3a/vaulin/"
+folder_save = 'eval_23_08_23_2'
+roi = 'low_mumu'
+if not os.path.exists(f"./plot/{folder_save}"):
+    os.mkdir(f"./plot/{folder_save}")
+if not os.path.exists(f"./plot/{folder_save}/ROI_simple"):
+    os.mkdir(f"./plot/{folder_save}/ROI_simple")
+if not os.path.exists(f"./plot/{folder_save}/ROI_simple/{roi}"):
+    os.mkdir(f"./plot/{folder_save}/ROI_simple/{roi}")
+
+if not os.path.exists(net_path + f"plot/{folder_save}"):
+    os.mkdir(net_path + f"plot/{folder_save}")
+df = pd.read_csv(f'./plot/{folder_save}/xgb_training_dataset_{roi}.csv')
+
+
+bgs = ['DY', "ZZ", "WZ", "tt", "ZHtobb"]
+#bg_choice = 2
+#bg_choice_2 = 0
+
+eta = 0.1
+#eta = 0.03, 0.12, 0.3, 0.45, 0.8
+
+#df = df[(df.target_bg == 0)|(df.target_bg == bg_choice+1)|(df.target_bg == bg_choice_2+1)]
+
+time = arrow.now().format("YY_MM_DD")
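For orientation: target_bg appears to encode the signal as 0 and the i-th entry of bgs as i+1, which is the convention the commented-out filter above and the explicit filters in the sibling scripts rely on. This is an assumption read off those filters, not a documented schema; a toy illustration:

import pandas as pd

bgs_demo = ['DY', 'ZZ', 'WZ', 'tt', 'ZHtobb']
toy = pd.DataFrame({'target_bg': [0, 1, 3, 5]})
# 0 -> signal, i >= 1 -> bgs_demo[i - 1]
print(toy['target_bg'].map(lambda t: 'signal' if t == 0 else bgs_demo[t - 1]).tolist())
# ['signal', 'DY', 'WZ', 'ZHtobb']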
+plt.style.use(hep.style.ROOT)
+
+
+########################################################################################
+########## drop target from df and bring it to a separate column, drop weights #########
+########################################################################################
+X = df.drop("target", axis = 1)
+X = X.drop("target_bg", axis = 1)
+print(X)
+X = X.drop(f"wei_{roi}", axis = 1)
+X = X.drop(f"Z_mass_{roi}", axis = 1)
+X = X.drop(f"Z_pt_gen_{roi}", axis = 1)
+X = X.drop(f"Z_mass_gen_{roi}", axis = 1)
+print(X)
+print(X.info())
+
+y = df["target"]
+print(y)
+
+
+########################################################################################
+################# GRID search attempt ##################################################
+########################################################################################
+'''
+from sklearn.model_selection import GridSearchCV
+
+### Create the parameter grid
+gbm_param_grid = {'max_depth' : [3, 4, 5, 6, 7, 8, 9], 'min_child_weight' : [1], 'gamma' : [0], 'subsample' : [0.8], 'colsample_bytree' : [0.8], 'reg_alpha' : [0.005], 'n_estimators': [1000]}
+
+gbm = xgb.XGBRegressor()
+
+grid_mse = GridSearchCV(param_grid = gbm_param_grid, estimator = gbm, scoring = 'neg_mean_squared_error', cv = 4, verbose = 1)
+
+grid_mse.fit(X,y)
+
+
+print("Best parameters found: ", grid_mse.best_params_)
+print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_)))
+'''
+
+########################################################################################
+############# An attempt to do hyperparameter tuning for the classifier fit ############
+########################################################################################
+space = {"max_depth": hp.quniform("max_depth", 3, 18, 1),
+        "gamma": hp.uniform("gamma", 1, 9),
+        "reg_alpha": hp.quniform("reg_alpha", 40, 180, 1),
+        "reg_lambda": hp.uniform("reg_lambda", 0, 1),
+        "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1),
+        "min_child_weight": hp.quniform("min_child_weight", 0, 10, 1),
+        "n_estimators": 200,
+        "learning_rate": hp.uniform("learning_rate", 0.001, 0.1),
+        "subsample": hp.uniform("subsample", 0.8, 1),
+        "seed":0}
+
+#learning_rate = space['learning_rate'],
+
+def objective(space):
+    # keep colsample_bytree as a float: int() would floor the sampled 0.5-1 range to 0
+    clf = xgb.XGBClassifier( n_estimators = int(space['n_estimators']), max_depth = int(space['max_depth']), gamma = space['gamma'], reg_alpha = int(space['reg_alpha']), min_child_weight = int(space['min_child_weight']), colsample_bytree = float(space['colsample_bytree']), eval_metric = 'auc', early_stopping_rounds = 10)
+    evaluation = [(X_train, y_train), (X_test, y_test)]
+
+    clf.fit(X_train, y_train, eval_set = evaluation, verbose = False)
+    pred = clf.predict(X_test)
+    accuracy = accuracy_score(y_test, pred>0.5)
+    print("SCORE: ", accuracy)
+    return {'loss': -accuracy, 'status': STATUS_OK}
+
+
+#########################################################################################
+############# Create pipelines for xgb training #########################################
+#########################################################################################
+from sklearn.impute import SimpleImputer
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import OneHotEncoder
+
+categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),])
+
+from sklearn.preprocessing import StandardScaler
+numeric_pipeline = Pipeline(steps = [("impute", 
SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + +cat_cols = X.select_dtypes(exclude = "number").columns +num_cols = X.select_dtypes(include = "number").columns + +print(cat_cols) +print(num_cols) + +from sklearn.compose import ColumnTransformer + +full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + + + +X_processed = full_processor.fit_transform(X) +y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + +######################################################################################### +############ split dataset into training and test ####################################### +######################################################################################### +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state = 1121218) +#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 101, stratify = y) +print(X_train) +print(X_test) +print(y_train) + +############################################################################################################ +######### preparing the XGB classifiers in 20 x 5-folds cross validation using repeated k-fold ############# +############################################################################################################ +cv = RepeatedKFold(n_splits = 8, n_repeats = 20, random_state = 101) +folds = [(train, test) for train, test in cv.split(X_train, y_train)] +#print(folds) +metrics = ['auc', 'fpr', 'tpr', 'thresholds'] +results = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +results_zero_train = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +results_weak_train = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + +params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': eta} +with open(net_path + f"plot/{folder_save}/results_first_{roi}_{eta}.json", 'w') as outfile: + json.dump(results, outfile) + + + +dtest = xgb.DMatrix(X_test, label = y_test) +#print(dtest) +for train, test in tqdm(folds, total = len(folds)): + print('train') + dtrain = xgb.DMatrix(X_train[train,:], + label = y_train[train]) + dval = xgb.DMatrix(X_train[test, :], label = y_train[test]) + model = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 10, num_boost_round = 200) #num_boost_round = 1000, 200 is optimal + model_zero_train = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 10, num_boost_round = 0) #num_boost_round = 1000, 200 is optimal + model_weak_train = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 10, num_boost_round = 20) #num_boost_round = 1000, 200 is optimal + sets = [dtrain, dval, dtest] + for i, ds in enumerate(results.keys()): + print(i) + y_preds = model.predict(sets[i]) + y_preds_zero_train = model_zero_train.predict(sets[i]) + y_preds_weak_train = model_weak_train.predict(sets[i]) + labels = sets[i].get_label() + fpr, tpr, thresholds = roc_curve(labels, y_preds) + fpr_zero, tpr_zero, thresholds_zero = 
roc_curve(labels, y_preds_zero_train) + fpr_weak, tpr_weak, thresholds_weak = roc_curve(labels, y_preds_weak_train) + results[ds]['fpr'].append(fpr) + results[ds]['tpr'].append(tpr) + results[ds]['thresholds'].append(thresholds) + results[ds]['auc'].append(roc_auc_score(labels, y_preds)) + results_zero_train[ds]['fpr'].append(fpr_zero) + results_zero_train[ds]['tpr'].append(tpr_zero) + results_zero_train[ds]['thresholds'].append(thresholds_zero) + results_zero_train[ds]['auc'].append(roc_auc_score(labels, y_preds_zero_train)) + results_weak_train[ds]['fpr'].append(fpr_weak) + results_weak_train[ds]['tpr'].append(tpr_weak) + results_weak_train[ds]['thresholds'].append(thresholds_weak) + results_weak_train[ds]['auc'].append(roc_auc_score(labels, y_preds_weak_train)) + +def convert(x): + if hasattr(x, "tolist"): + return x.tolist() + raise TypeError(x) + +with open(f"./plot/{folder_save}/results_lr_{roi}_{eta}_bg_full_bg_set.json", 'w') as outfile: + #json.dump(results, outfile, indent = 4) + str_j = json.dumps(results, indent = 4, sort_keys = True, default=convert) + outfile.write(str_j) + +with open(f"./plot/{folder_save}/results_zero_train_lr_{roi}_{eta}_bg_full_bg_set.json", 'w') as outfile: + #json.dump(results, outfile, indent = 4) + str_j = json.dumps(results_zero_train, indent = 4, sort_keys = True, default=convert) + outfile.write(str_j) + +with open(f"./plot/{folder_save}/results_weak_train_lr_{roi}_{eta}_bg_full_bg_set.json", 'w') as outfile: + #json.dump(results, outfile, indent = 4) + str_j = json.dumps(results_weak_train, indent = 4, sort_keys = True, default=convert) + outfile.write(str_j) + +########################################################################################################## +############## plotting the ROC curves with uncertainties ################################################ +########################################################################################################## +kind = 'val' + +c_fill = 'rgba(52, 152, 219, 0.2)' +c_line = 'rgba(52, 152, 219, 0.5)' +c_line_main = 'rgba(41, 128, 185, 1.0)' +c_grid = 'rgba(189, 195, 199, 0.5)' +c_annot = 'rgba(149, 165, 166, 0.5)' +c_highlight = 'rgba(192, 57, 43, 1.0)' + +fpr_mean = np.linspace(0, 1, 100) + +interp_tprs = [] +for i in range(100): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) +tpr_mean = np.mean(interp_tprs, axis = 0) +tpr_mean[-1] = 1.0 +tpr_std = 2*np.std(interp_tprs, axis = 0) +tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) +tpr_lower = tpr_mean - tpr_std +auc = np.mean(results[kind]['auc']) + +import plotly.graph_objects as go + +fig = go.Figure([go.Scatter(x = tpr_upper, y = fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = tpr_mean, y = fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.3f}')]) + +fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 0, y1 = 1) +fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'FPR (Background rejection)', yaxis_title = 'TPR (signal efficiency)', width = 1600, height = 900, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) +fig.update_yaxes(range 
= [0,1], gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') +fig.update_xaxes(range = [0,1], gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + +fig.write_image(f"./plot/{folder_save}/ROI_simple/{roi}/plotly_ROC_bg_eff_{eta}.jpg") +fig.write_image(f"./plot/{folder_save}/ROI_simple/{roi}/plotly_ROC_bg_eff_{eta}.pdf") + + +fig = go.Figure([go.Scatter(x = 1 - fpr_mean, y = tpr_upper, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = 1 - fpr_mean, y = tpr_lower, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = 1 - fpr_mean, y = tpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.3f}')]) + +fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) +fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = '1 - FPR (Background rejection)', yaxis_title = 'TPR (signal efficiency)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) +fig.update_yaxes(range = [0,1], gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') +fig.update_xaxes(range = [0,1], gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + +fig.write_image(f"./plot/{folder_save}/ROI_simple/{roi}/plotly_ROC_bg_rej_bg_full_bg_set_{eta}.jpg") +fig.write_image(f"./plot/{folder_save}/ROI_simple/{roi}/plotly_ROC_bg_rej_bg_full_bg_set_{eta}.pdf") + +################################################################################################## +########## Actual hyperparameter tuning ########################################################## +################################################################################################## + +trials = Trials() + +best_hyperparams = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 100, trials = trials) +print("The best hyperparameters are: ", "\n") +print(best_hyperparams) + +################################################################################################## +################################################################################################## +################################################################################################## + + + + + + + + + + + + + + +from sklearn.metrics import accuracy_score + +### Init classifier +xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = best_hyperparams['learning_rate'], gamma = best_hyperparams['gamma'], reg_alpha = best_hyperparams['reg_alpha'], reg_lambda = best_hyperparams['reg_lambda'], n_estimators = 200, max_depth = int(best_hyperparams['max_depth']), subsample = best_hyperparams['subsample'], min_child_weight = best_hyperparams['min_child_weight'], colsample_bytree = best_hyperparams['colsample_bytree']) +#xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = 0.0292, gamma = 1.087, reg_alpha = 42.0, reg_lambda = 0.381, n_estimators = 200, max_depth = 8, subsample = 0.841, min_child_weight = 2.0, colsample_bytree = 0.994) + + +### Fit +dtest = xgb.DMatrix(X_test, label = y_test) +#print(dtest) +dtrain = xgb.DMatrix(X_train[:int(len(X_train)*0.8), :], label = y_train[:int(len(y_train)*0.8)]) +dval = xgb.DMatrix(X_train[int(len(X_train)*0.8):, :], label = y_train[int(len(y_train)*0.8):]) +model_xgb = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 
'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 200) #num_boost_round = 1000, +sets = [dtrain, dval, dtest] +results_new = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +params_new = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss'} + +for i, ds in enumerate(results_new.keys()): + print(i) + y_preds_new = model_xgb.predict(sets[i]) + labels_new = sets[i].get_label() + fpr_new, tpr_new, thresholds_new = roc_curve(labels_new, y_preds_new) + results_new[ds]['fpr'].append(fpr_new) + results_new[ds]['tpr'].append(tpr_new) + results_new[ds]['thresholds'].append(thresholds_new) + results_new[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new)) + +xgb_cl.fit(X_train, y_train) + +print(xgb_cl) +### Predict +preds = xgb_cl.predict(X_test) + +print(accuracy_score(y_test, preds)) + +print(y_test) +print(model_xgb.predict(dtest)) +print(np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)])) +predict_train = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)]) + +print(accuracy_score(y_test, predict_train)) + +from xgboost import plot_importance +from xgboost import plot_tree, to_graphviz + +importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_}) +importances = importances.sort_values(by = "Importance", ascending = False) +importances = importances.set_index('Feature') +print(importances) +importances.plot.bar() + +fig, ax = plt.subplots(figsize=(17,12)) +plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax) +plt.xlabel('Feature scores') +plt.ylabel("Feature names") +plt.title('Importance plot') +plt.legend(['']) +#plt.show() +plt.savefig(f"./plot/{folder_save}/importance_bg_{roi}_full_bg_set_{eta}.jpg") + +feature_importance = model.get_score(importance_type = 'weight') +keys = list(feature_importance.keys()) +names_sig = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] +values = list(feature_importance.values()) +data = pd.DataFrame(data = values, index = names_sig, columns = ['score']).sort_values(by = 'score', ascending = False) +print(data) +print(data.index) + + +fig = plt.figure(figsize=(17,12)) +ax1 = fig.add_subplot(1,2,1) +ax1.set_axis_off() +ax2 = fig.add_subplot(1,2,2) +ax2.barh(list(reversed(data.index)), list(reversed(data.score))) +ax2.set_xlabel('Feature scores') +ax2.set_ylabel("Feature names") +ax2.set_title('Importance plot') +#plt.show() +plt.savefig(f"./plot/{folder_save}/ROI_simple/{roi}/importance_train_bg_full_bg_set_{eta}.jpg") + +plt.figure(figsize=(17,12)) +plot_tree(xgb_cl, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"./plot/{folder_save}/ROI_simple/{roi}/boost_tree_bg_full_bg_set_{eta}.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' + +plt.figure(figsize=(17,12)) +plot_tree(model_xgb, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"./plot/{folder_save}/ROI_simple/{roi}/boost_tree_train_bg_full_bg_set_{eta}.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' +''' +plt.figure(figsize=(17,12)) +to_graphviz(model_xgb, fmap = 'feature_map.txt') +plt.title('Decision 
tree graph') +#plt.show() +plt.savefig(f"plot/{folder_save}/boost_tree_train_graphviz.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' diff --git a/xgb_test_only_xgb_reloaded_no_coffea.py b/xgb_test_only_xgb_reloaded_no_coffea.py index bc91384..eaebb78 100644 --- a/xgb_test_only_xgb_reloaded_no_coffea.py +++ b/xgb_test_only_xgb_reloaded_no_coffea.py @@ -14,7 +14,7 @@ import json net_path = "/net/scratch_cms3a/vaulin/" -folder_save = 'eval_23_05_02' +folder_save = 'eval_23_06_26_2' if not os.path.exists(f"./plot/{folder_save}"): os.mkdir(f"./plot/{folder_save}") if not os.path.exists(net_path + f"plot/{folder_save}"): @@ -133,22 +133,22 @@ def pretty_ROC_Curve(tr_set, kind, type): fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black') - fig.write_image(net_path + f"plot/{folder_save}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}.jpg") - fig.write_image(net_path + f"plot/{folder_save}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}.pdf") + fig.write_image(f"./plot/{folder_save}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}.jpg") + fig.write_image(f"./plot/{folder_save}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}.pdf") -pretty_ROC_Curve(net_path + f"plot/{folder_save}/results_lr_{learning_rate}.json", kind, "full") +pretty_ROC_Curve(f"./plot/{folder_save}/results_lr_{learning_rate}.json", kind, "full") ############################################################################################################################################################## ##################### Zero train ROC ######################################################################################################################### ############################################################################################################################################################## -pretty_ROC_Curve(net_path + f"plot/{folder_save}/results_zero_train_lr_{learning_rate}.json", kind, 'zero') +pretty_ROC_Curve(f"./plot/{folder_save}/results_zero_train_lr_{learning_rate}.json", kind, 'zero') ############################################################################################################################################################## ##################### Weak train ROC ######################################################################################################################### ############################################################################################################################################################## -pretty_ROC_Curve(net_path + f"plot/{folder_save}/results_weak_train_lr_{learning_rate}.json", kind, 'weak') +pretty_ROC_Curve(f"./plot/{folder_save}/results_weak_train_lr_{learning_rate}.json", kind, 'weak') ############################################################################################################################################################## @@ -223,7 +223,7 @@ def pretty_ROC_Curve(tr_set, kind, type): plt.title('Importance plot') plt.legend(['']) #plt.show() -plt.savefig(net_path + f"plot/{folder_save}/importance.jpg") +plt.savefig(f"./plot/{folder_save}/importance.jpg") 
feature_importance = model_xgb.get_score(importance_type = 'weight') @@ -247,14 +247,14 @@ def pretty_ROC_Curve(tr_set, kind, type): ax2.set_ylabel("Feature names") ax2.set_title('Importance plot') #plt.show() -plt.savefig(net_path + f"plot/{folder_save}/importance_train_lr_{learning_rate}.jpg") +plt.savefig(f"./plot/{folder_save}/importance_train_lr_{learning_rate}.jpg") plt.figure(figsize=(17,12)) plot_tree(xgb_cl, fmap = 'feature_map.txt') plt.title('Decision tree graph') #plt.show() -plt.savefig(net_path + f"plot/{folder_save}/boost_tree.jpg", dpi = 1800) +plt.savefig(f"./plot/{folder_save}/boost_tree.jpg", dpi = 1800) ###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 #plt.show()''' @@ -262,7 +262,7 @@ def pretty_ROC_Curve(tr_set, kind, type): plot_tree(model_xgb, fmap = 'feature_map.txt') plt.title('Decision tree graph') #plt.show() -plt.savefig(net_path + f"plot/{folder_save}/boost_tree_train_lr_{learning_rate}.jpg", dpi = 1800) +plt.savefig(f"./plot/{folder_save}/boost_tree_train_lr_{learning_rate}.jpg", dpi = 1800) ###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 #plt.show()''' @@ -273,7 +273,7 @@ def pretty_ROC_Curve(tr_set, kind, type): plt.title('Classifier output') plt.legend(['Train output', 'Train output after threshold','Test data']) #plt.show() -plt.savefig(net_path + f"plot/{folder_save}/class_output_train_lr_{learning_rate}.jpg") +plt.savefig(f"./plot/{folder_save}/class_output_train_lr_{learning_rate}.jpg") ###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 #plt.show()''' diff --git a/xgb_test_only_xgb_reloaded_no_coffea_var.py b/xgb_test_only_xgb_reloaded_no_coffea_var.py index 4451ef9..6e1aa4c 100644 --- a/xgb_test_only_xgb_reloaded_no_coffea_var.py +++ b/xgb_test_only_xgb_reloaded_no_coffea_var.py @@ -14,7 +14,7 @@ import json net_path = "/net/scratch_cms3a/vaulin/" -folder_save = 'eval_23_05_02' +folder_save = 'eval_23_06_26_2' if not os.path.exists(f"./plot/{folder_save}"): os.mkdir(f"./plot/{folder_save}") if not os.path.exists(net_path + f"plot/{folder_save}"): @@ -29,11 +29,14 @@ 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', 'del_phi_l2_subleading', 'del_phi_l2_leading'] -var = f'Higgs_mass_{roi}' +var = f'del_phi_l2_subleading_{roi}' time = arrow.now().format("YY_MM_DD") plt.style.use(hep.style.ROOT) + +df = df.sample(frac = 1).reset_index(drop=True) + X = df[var] print(X) print(X.info()) @@ -271,8 +274,68 @@ def pretty_ROC_Curve_var(results, kind, type, var): fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}_new.jpg") fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}_new.pdf") +def pretty_ROC_Curve_var_test_train_val(results, type, var): + + results = results + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + + c_fill = 'rgba(52, 152, 219, 0.2)' + c_line = 'rgba(52, 152, 219, 0.5)' + c_line_train = 'rgba(41, 128, 185, 1.0)' + c_line_test = 'rgba(58, 217, 19, 0.8)' + c_line_val = 'rgba(244, 70, 10, 0.8)' + c_grid = 'rgba(189, 195, 199, 0.5)' + c_annot = 'rgba(149, 165, 166, 0.5)' + c_highlight = 'rgba(192, 57, 43, 1.0)' + + fpr_mean = np.linspace(0, 1, 100) + + interp_tprs = [] + + range_plot_x = [0,1] + range_plot_y = [0.2,1] + + import plotly.graph_objects as go + colours = {'test':c_line_test, 
'train': c_line_train, 'val': c_line_val}
+    fig_test = 0
+    fig_train = 0
+    fig_val = 0
+    figs = {'test': fig_test, 'train': fig_train, 'val': fig_val}
+    for kind in ['test', 'val', 'train']:
+        for i in range(1):
+            fpr = results[kind]['fpr'][i]
+            tpr = results[kind]['tpr'][i]
+            interp_tpr = np.interp(fpr_mean, fpr, tpr)
+            interp_tpr[0] = 0.0
+            interp_tprs = [interp_tpr] # keep only this kind's curve; appending across kinds would mix test/val/train in the means below
+        tpr_mean = np.mean(interp_tprs, axis = 0)
+        tpr_mean[-1] = 1.0
+        tpr_std = 2*np.std(interp_tprs, axis = 0)
+        tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1)
+        tpr_lower = tpr_mean - tpr_std
+        auc = np.mean(results[kind]['auc'])
+        colour = colours[kind]
+
+        figs[kind] = go.Scatter(x = tpr_mean, y = 1 - fpr_mean, line = dict(color = colour, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.5f}, {kind}')
+    fig = go.Figure(data = [figs['test'], figs['train'], figs['val']])
+
+    fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0)
+    fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,))
+    fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black')
+    fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black')
+
+    if not os.path.exists(f"plot/{folder_save}/ROC"):
+        os.mkdir(f"plot/{folder_save}/ROC")
+
+    fig.write_image(f"plot/{folder_save}/ROC/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_all_{var}_new.jpg")
+    fig.write_image(f"plot/{folder_save}/ROC/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_all_{var}_new.pdf")
+
 pretty_ROC_Curve_var(results_new, 'test', 'full', var)
+pretty_ROC_Curve_var_test_train_val(results_new, 'full', var)
+
 xgb_cl.fit(X_train, y_train)
 
 print(xgb_cl)
@@ -318,7 +381,7 @@ def pretty_ROC_Curve_var(results, kind, type, var):
                '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet',
                '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$',
                '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$']'''
-names_sig = ['m(H)']
+names_sig = ['$\Delta\Phi (l_{subleading}, jet_{subleading})$']
 values = list(feature_importance.values())
 data = pd.DataFrame(data = values, index = names_sig, columns = ['score']).sort_values(by = 'score', ascending = False)
 print(data)
diff --git a/xgb_test_only_xgb_reloaded_no_coffea_vars.py b/xgb_test_only_xgb_reloaded_no_coffea_vars.py
new file mode 100644
index 0000000..93eed4e
--- /dev/null
+++ b/xgb_test_only_xgb_reloaded_no_coffea_vars.py
@@ -0,0 +1,521 @@
+from coffea.util import load
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt, mplhep as hep
+import hist
+import argparse, sys, os, arrow, glob, yaml
+from matplotlib.offsetbox import AnchoredText
+import xgboost as xgb
+from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
+from sklearn.metrics import accuracy_score
+from tqdm.notebook import tqdm
+from sklearn.metrics import roc_auc_score, roc_curve
+from sklearn.model_selection import RepeatedKFold
+import json
+
+net_path = "/net/scratch_cms3a/vaulin/"
+folder_save = 'eval_23_07_19'
+if not 
os.path.exists(f"./plot/{folder_save}"): + os.mkdir(f"./plot/{folder_save}") +if not os.path.exists(f"./plot/{folder_save}/Diff_ROCs"): + os.mkdir(f"./plot/{folder_save}/Diff_ROCs") +if not os.path.exists(net_path + f"plot/{folder_save}"): + os.mkdir(net_path + f"plot/{folder_save}") +df = pd.read_csv(net_path + 'xgb_training_dataset_low_mumu.csv') + +roi = 'low_mumu' +learning_rate = 0.12 + +from itertools import combinations + +names_sig = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +names_ind_array = np.arange(len(names_sig)) + +possible_combos = [list(combinations(names_ind_array, i)) for i in range(1,len(names_sig))] +#print(possible_combos) +print(possible_combos[1]) +#print(possible_combos[-1]) + + +length = [len(possible_el) for possible_el in possible_combos] +print(length) + +import random +sequence_list = np.arange(0,len(names_sig)) +#print(sequence_list) +random.shuffle(sequence_list) +print(sequence_list) + +interesting_combos = [] +combos = [] + +for i in range(0, len(length)): + #print([len(elem) for elem in possible_combos[i]]) + for j in range(0, len(possible_combos[i])): + #print(list(possible_combos[i][j])) + #print(list(sequence_list[:i])) + if sorted(list(possible_combos[i][j])) == sorted(list(sequence_list[:(i+1)])): + print(sorted(list(possible_combos[i][j]))) + print(sorted(list(sequence_list[:(i+1)]))) + print(i, j) + combos.append([i,j]) + interesting_combos.append(sorted(list(possible_combos[i][j]))) + +print(combos) +#for k in range(0,len(combos)): + +print(interesting_combos) + + +############################################################################################################################################### +################### Getting ROC curves from json files ######################################################################################## +############################################################################################################################################### +def convert(x): + if hasattr(x, "tolist"): + return x.tolist() + raise TypeError(x) + + +kind = 'val' +#kind = 'test' +#kind = 'train' + +def pretty_ROC_Curve(tr_set, kind, type, var): + + with open(tr_set) as user_file: + file_contents = user_file.read() + + results = json.loads(file_contents) + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + + c_fill = 'rgba(52, 152, 219, 0.2)' + c_line = 'rgba(52, 152, 219, 0.5)' + c_line_main = 'rgba(41, 128, 185, 1.0)' + c_grid = 'rgba(189, 195, 199, 0.5)' + c_annot = 'rgba(149, 165, 166, 0.5)' + c_highlight = 'rgba(192, 57, 43, 1.0)' + + fpr_mean = np.linspace(0, 1, 100) + + interp_tprs = [] + for i in range(100): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) + tpr_mean = np.mean(interp_tprs, axis = 0) + tpr_mean[-1] = 1.0 + tpr_std = 2*np.std(interp_tprs, axis = 0) + tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) + tpr_lower = tpr_mean - tpr_std + auc = np.mean(results[kind]['auc']) + + range_plot_x = [0,1] + range_plot_y = [0.2,1] + + import plotly.graph_objects as go + + + fig = go.Figure([go.Scatter(x = tpr_upper, y = 1 - fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', 
showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = 1 - fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = tpr_mean, y = 1 - fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.5f}')]) + + fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) + fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) + fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') + fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + + fig.write_image(f"plot/{folder_save}/Diff_ROCs/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}.jpg") + fig.write_image(f"plot/{folder_save}/Diff_ROCs/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}.pdf") + +def pretty_ROC_Curve_var(results, kind, type, var): + + results = results + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + + c_fill = 'rgba(52, 152, 219, 0.2)' + c_line = 'rgba(52, 152, 219, 0.5)' + c_line_main = 'rgba(41, 128, 185, 1.0)' + c_grid = 'rgba(189, 195, 199, 0.5)' + c_annot = 'rgba(149, 165, 166, 0.5)' + c_highlight = 'rgba(192, 57, 43, 1.0)' + + fpr_mean = np.linspace(0, 1, 100) + + interp_tprs = [] + for i in range(1): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) + tpr_mean = np.mean(interp_tprs, axis = 0) + tpr_mean[-1] = 1.0 + tpr_std = 2*np.std(interp_tprs, axis = 0) + tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) + tpr_lower = tpr_mean - tpr_std + auc = np.mean(results[kind]['auc']) + + range_plot_x = [0,1] + range_plot_y = [0.2,1] + + import plotly.graph_objects as go + + + fig = go.Figure([go.Scatter(x = tpr_upper, y = 1 - fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = 1 - fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = tpr_mean, y = 1 - fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.5f}')]) + + fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) + fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) + fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') + fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + + fig.write_image(f"plot/{folder_save}/Diff_ROCs/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}_new.jpg") + 
fig.write_image(f"plot/{folder_save}/Diff_ROCs/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}_new.pdf") + +def pretty_ROC_Curve_var_test_train_val(results, type, var): + + results = results + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + + c_fill = 'rgba(52, 152, 219, 0.2)' + c_line = 'rgba(52, 152, 219, 0.5)' + c_line_train = 'rgba(41, 128, 185, 1.0)' + c_line_test = 'rgba(58, 217, 19, 0.8)' + c_line_val = 'rgba(244, 70, 10, 0.8)' + c_grid = 'rgba(189, 195, 199, 0.5)' + c_annot = 'rgba(149, 165, 166, 0.5)' + c_highlight = 'rgba(192, 57, 43, 1.0)' + + fpr_mean = np.linspace(0, 1, 100) + + interp_tprs = [] + + range_plot_x = [0,1] + range_plot_y = [0.2,1] + + import plotly.graph_objects as go + colours = {'test':c_line_test, 'train': c_line_train, 'val': c_line_val} + fig_test = 0 + fig_train = 0 + fig_val = 0 + figs = {'test': fig_test, 'train': fig_train, 'val': fig_val} + for kind in ['test', 'val', 'train']: + for i in range(1): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) + tpr_mean = np.mean(interp_tprs, axis = 0) + tpr_mean[-1] = 1.0 + tpr_std = 2*np.std(interp_tprs, axis = 0) + tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) + tpr_lower = tpr_mean - tpr_std + auc = np.mean(results[kind]['auc']) + colour = colours[kind] + + + figs[kind] = go.Scatter(x = tpr_mean, y = 1 - fpr_mean, line = dict(color = colour, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.5f}, {kind}') + fig = go.Figure(data = [figs['test'], figs['train'], figs['val']]) + + fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) + fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) + fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') + fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + + if not os.path.exists(f"plot/{folder_save}/ROC"): + os.mkdir(f"plot/{folder_save}/ROC") + + fig.write_image(f"plot/{folder_save}/ROC/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_all_{var}_new.jpg") + fig.write_image(f"plot/{folder_save}/ROC/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_all_{var}_new.pdf") +############################################################################################################################### + +for versions in interesting_combos: + versions_true = [int(version) for version in versions] + versions = [True if value in versions_true else False for value in range(0, len(names_sig))] + print(versions) + print(np.array(names_sig)[versions]) + var = np.array(names_sig)[versions] + var = [f"{va}_{roi}" for va in var] + + time = arrow.now().format("YY_MM_DD") + plt.style.use(hep.style.ROOT) + + + df = df.sample(frac = 1).reset_index(drop=True) + + X = df[list(var)] + print(X) + print(X.info()) + + X_signal = df[var][df.target == 1] + X_bg = df[var][df.target == 0] + + y = df["target"] + print(y) + + y_signal = df["target"][df.target == 1] + y_bg = 
df["target"][df.target == 0] + + from sklearn.impute import SimpleImputer + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import OneHotEncoder + + categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + + from sklearn.preprocessing import StandardScaler + numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + + cat_cols = X.select_dtypes(exclude = "number").columns + num_cols = X.select_dtypes(include = "number").columns + + print(cat_cols) + print(num_cols) + + from sklearn.compose import ColumnTransformer + + full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + + + + X_processed = full_processor.fit_transform(X) + y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + + y_processed_sig = SimpleImputer(strategy = "most_frequent").fit_transform(y_signal.values.reshape(-1,1)) + y_processed_bg = SimpleImputer(strategy = "most_frequent").fit_transform(y_bg.values.reshape(-1,1)) + + from sklearn.model_selection import train_test_split + + X_train, X_test, y_train, y_test = train_test_split(X, y_processed, stratify = y_processed, random_state = 1121218) + X_train_sig, X_test_sig, y_train_sig, y_test_sig = train_test_split(X_signal, y_processed_sig, stratify = y_processed_sig, random_state = 1121218) + X_train_bg, X_test_bg, y_train_bg, y_test_bg = train_test_split(X_bg, y_processed_bg, stratify = y_processed_bg, random_state = 1121218) + #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 101, stratify = y) + print(X_train) + print(X_test) + print(y_train) + + + + + + pretty_ROC_Curve(f"plot/{folder_save}/results_lr_{learning_rate}.json", kind, "full", versions_true) +############################################################################################################################################################## +##################### Zero train ROC ######################################################################################################################### +############################################################################################################################################################## + + pretty_ROC_Curve(f"plot/{folder_save}/results_zero_train_lr_{learning_rate}.json", kind, 'zero', versions_true) + +############################################################################################################################################################## +##################### Weak train ROC ######################################################################################################################### +############################################################################################################################################################## + + pretty_ROC_Curve(f"plot/{folder_save}/results_weak_train_lr_{learning_rate}.json", kind, 'weak', versions_true) + +############################################################################################################################################################## + + + trials = Trials() + +############################################################################################################################################################## +##################### Initiate the final training 
to be presented with the best parameters ################################################################### +############################################################################################################################################################## + + from sklearn.metrics import accuracy_score + +### Init classifier + xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = 0.0292, gamma = 1.087, reg_alpha = 42.0, reg_lambda = 0.381, n_estimators = 200, max_depth = 8, subsample = 0.841, min_child_weight = 2.0, colsample_bytree = 0.994, scale_pos_weight = 10) + +### Fit + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + dtest = xgb.DMatrix(X_test, label = y_test) + dtest_signal = xgb.DMatrix(X_test_sig, label = y_test_sig) + dtest_bg = xgb.DMatrix(X_test_bg, label = y_test_bg) +#print(dtest) + dtrain = xgb.DMatrix(X_train[:int(len(X_train)*0.8)], label = y_train[:int(len(y_train)*0.8)]) + dval = xgb.DMatrix(X_train[int(len(X_train)*0.8):], label = y_train[int(len(y_train)*0.8):]) + model_xgb = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 200) #num_boost_round = 1000, + model_xgb_weak = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 20) #num_boost_round = 1000, + model_xgb_zero = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 2) #num_boost_round = 1000, + sets = [dtrain, dval, dtest] + results_new = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + results_new_weak = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + results_new_zero = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + params_new = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss'} + + for i, ds in enumerate(results_new.keys()): + print(i) + y_preds_new = model_xgb.predict(sets[i]) + y_preds_new_weak = model_xgb_weak.predict(sets[i]) + y_preds_new_zero = model_xgb_zero.predict(sets[i]) + labels_new = sets[i].get_label() + fpr_new, tpr_new, thresholds_new = roc_curve(labels_new, y_preds_new) + fpr_new_weak, tpr_new_weak, thresholds_new_weak = roc_curve(labels_new, y_preds_new_weak) + fpr_new_zero, tpr_new_zero, thresholds_new_zero = roc_curve(labels_new, y_preds_new_zero) + results_new[ds]['fpr'].append(fpr_new) + results_new[ds]['tpr'].append(tpr_new) + results_new[ds]['thresholds'].append(thresholds_new) + results_new[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new)) + results_new_weak[ds]['fpr'].append(fpr_new_weak) + results_new_weak[ds]['tpr'].append(tpr_new_weak) + results_new_weak[ds]['thresholds'].append(thresholds_new_weak) + results_new_weak[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new_weak)) + results_new_zero[ds]['fpr'].append(fpr_new_zero) + results_new_zero[ds]['tpr'].append(tpr_new_zero) + results_new_zero[ds]['thresholds'].append(thresholds_new_zero) + results_new_zero[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new_zero)) + + + + pretty_ROC_Curve_var(results_new, 'test', 'full', versions_true) + + pretty_ROC_Curve_var_test_train_val(results_new, 'full', versions_true) + + 
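+    # --- Editor's sketch (assumption, not part of the original flow): the
+    # pretty_ROC_Curve() calls above read per-fold ROC results from JSON files
+    # that an earlier k-fold run is assumed to have written. The results_new
+    # dictionary computed above can be saved in the same layout via the convert()
+    # helper defined at the top of this script, so this feature combination can
+    # be re-plotted later without retraining; the file name here is hypothetical:
+    with open(f"plot/{folder_save}/results_vars_{versions_true}_lr_{learning_rate}.json", "w") as jf:
+        json.dump(results_new, jf, default = convert)
+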
xgb_cl.fit(X_train, y_train) + + print(xgb_cl) + +################################################################################################################################### +################################## Predict and give the final accuracy scores and importance plots ################################ +################################################################################################################################### + preds = xgb_cl.predict(X_test) + + print(accuracy_score(y_test, preds)) + + print(y_test) + print(model_xgb.predict(dtest)) + print(np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)])) + predict_train = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)]) + predict_train_weak = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb_weak.predict(dtest)]) + predict_train_zero = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb_zero.predict(dtest)]) + + print(accuracy_score(y_test, predict_train)) + +from xgboost import plot_importance +from xgboost import plot_tree, to_graphviz + +#importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_}) +#importances = importances.sort_values(by = "Importance", ascending = False) +#importances = importances.set_index('Feature') +#print(importances) +#importances.plot.bar() +''' +fig, ax = plt.subplots(figsize=(17,12)) +plot_importance(xgb_cl, fmap = 'feature_map_var.txt', ax = ax) +plt.xlabel('Feature scores') +plt.ylabel("Feature names") +plt.title('Importance plot') +plt.legend(['']) +#plt.show() +plt.savefig(f"plot/{folder_save}/importance_{var}.jpg") + + +feature_importance = model_xgb.get_score(importance_type = 'weight') +keys = list(feature_importance.keys()) +names_sig = ['m(H)', '$p_t$(H)', '$p_t$(Z)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$', + '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet', + '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$', + '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$'] +names_sig = ['$\Delta\Phi (l_{subleading}, jet_{subleading})$'] +values = list(feature_importance.values()) +data = pd.DataFrame(data = values, index = names_sig, columns = ['score']).sort_values(by = 'score', ascending = False) +print(data) +print(data.index) + + +fig = plt.figure(figsize=(17,12)) +ax1 = fig.add_subplot(1,2,1) +ax1.set_axis_off() +ax2 = fig.add_subplot(1,2,2) +ax2.barh(list(reversed(data.index)), list(reversed(data.score))) +ax2.set_xlabel('Feature scores') +ax2.set_ylabel("Feature names") +ax2.set_title('Importance plot') +#plt.show() +plt.savefig(f"plot/{folder_save}/importance_train_lr_{learning_rate}_{var}.jpg") + + +plt.figure(figsize=(17,12)) +plot_tree(xgb_cl, fmap = 'feature_map_var.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"plot/{folder_save}/boost_tree_{var}.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show() + +plt.figure(figsize=(17,12)) +plot_tree(model_xgb, fmap = 'feature_map_var.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"plot/{folder_save}/boost_tree_train_lr_{learning_rate}_{var}.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show() +''' +plt.figure(figsize=(17,12)) 
+plt.hist(np.array(model_xgb.predict(dtest)), bins = 40, edgecolor = 'blue',fill = False)
+plt.hist(np.array(predict_train), bins = 40, edgecolor = 'green', hatch = '/', fill = False)
+plt.hist(y_test, bins = 40, facecolor = 'orange', edgecolor = 'orange', fill = False)
+plt.title('Classifier output')
+plt.legend(['Model output on test set', 'Thresholded output', 'True test labels'])
+#plt.show()
+plt.savefig(f"plot/{folder_save}/class_output_train_lr_{learning_rate}_{versions_true}.jpg")
+
+plt.figure(figsize=(17,12))
+plt.hist(np.array(model_xgb.predict(dtest_signal)), bins = 40, edgecolor = 'blue',fill = False)
+plt.hist(np.array(model_xgb.predict(dtest_bg)), bins = 40, edgecolor = 'red', fill = False)
+plt.title('Classifier output')
+plt.legend(['Signal', 'Background'])
+#plt.show()
+plt.savefig(f"plot/{folder_save}/class_output_train_lr_{learning_rate}_{versions_true}_sig_vs_bg.jpg")
+
+plt.figure(figsize=(17,12))
+plt.hist(np.array(model_xgb_weak.predict(dtest)), bins = 40, edgecolor = 'blue',fill = False)
+plt.hist(np.array(predict_train_weak), bins = 40, edgecolor = 'green', hatch = '/', fill = False)
+plt.hist(y_test, bins = 40, facecolor = 'orange', edgecolor = 'orange', fill = False)
+plt.title('Classifier output')
+plt.legend(['Model output on test set', 'Thresholded output', 'True test labels'])
+#plt.show()
+plt.savefig(f"plot/{folder_save}/class_output_train_lr_{learning_rate}_{versions_true}_weak.jpg")
+
+plt.figure(figsize=(17,12))
+plt.hist(np.array(model_xgb_zero.predict(dtest)), bins = 40, edgecolor = 'blue',fill = False)
+plt.hist(np.array(predict_train_zero), bins = 40, edgecolor = 'green', hatch = '/', fill = False)
+plt.hist(y_test, bins = 40, facecolor = 'orange', edgecolor = 'orange', fill = False)
+plt.title('Classifier output')
+plt.legend(['Model output on test set', 'Thresholded output', 'True test labels'])
+#plt.show()
+plt.savefig(f"plot/{folder_save}/class_output_train_lr_{learning_rate}_{versions_true}_zero.jpg")
+
+###result = 1/(1+np.exp(-leaf_value)) for belonging to class 1
+#plt.show()
+
+with open(f"plot/{folder_save}/ROC.txt", "a") as myfile:
+    myfile.write(f"Accuracy score for {var}: " + str(accuracy_score(y_test, predict_train)) + " " + '\n')
+
+'''
+plt.figure(figsize=(17,12))
+to_graphviz(model_xgb, fmap = 'feature_map.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/boost_tree_train_graphviz.jpg", dpi = 1800)
+###result = 1/(1+np.exp(-leaf_value)) for belonging to class 1
+#plt.show()'''
diff --git a/xgb_test_only_xgb_reloaded_no_coffea_vars_bg.py b/xgb_test_only_xgb_reloaded_no_coffea_vars_bg.py
new file mode 100644
index 0000000..c5ad2ac
--- /dev/null
+++ b/xgb_test_only_xgb_reloaded_no_coffea_vars_bg.py
@@ -0,0 +1,524 @@
+from coffea.util import load
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt, mplhep as hep
+import hist
+import argparse, sys, os, arrow, glob, yaml
+from matplotlib.offsetbox import AnchoredText
+import xgboost as xgb
+from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
+from sklearn.metrics import accuracy_score
+from tqdm.notebook import tqdm
+from sklearn.metrics import roc_auc_score, roc_curve
+from sklearn.model_selection import RepeatedKFold
+import json
+
+net_path = "/net/scratch_cms3a/vaulin/"
+folder_save = 'eval_23_07_19'
+if not os.path.exists(f"./plot/{folder_save}"):
+    os.mkdir(f"./plot/{folder_save}")
+if not os.path.exists(f"./plot/{folder_save}/Diff_ROCs"):
+    os.mkdir(f"./plot/{folder_save}/Diff_ROCs")
+if not os.path.exists(net_path + f"plot/{folder_save}"):
+    
os.mkdir(net_path + f"plot/{folder_save}") +df = pd.read_csv('xgb_training_dataset_low_mumu.csv') + +roi = 'low_mumu' +learning_rate = 0.12 + +bgs = ['DY', "ZZ", "WZ", "tt", "ZHtobb"] +bg_choice = 2 + +from itertools import combinations + +names_sig = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +names_ind_array = np.arange(len(names_sig)) + +possible_combos = [list(combinations(names_ind_array, i)) for i in range(1,len(names_sig))] +#print(possible_combos) +print(possible_combos[1]) +#print(possible_combos[-1]) + + +length = [len(possible_el) for possible_el in possible_combos] +print(length) + +import random +sequence_list = np.arange(0,len(names_sig)) +#print(sequence_list) +random.shuffle(sequence_list) +print(sequence_list) + +interesting_combos = [] +combos = [] + +for i in range(0, len(length)): + #print([len(elem) for elem in possible_combos[i]]) + for j in range(0, len(possible_combos[i])): + #print(list(possible_combos[i][j])) + #print(list(sequence_list[:i])) + if sorted(list(possible_combos[i][j])) == sorted(list(sequence_list[:(i+1)])): + print(sorted(list(possible_combos[i][j]))) + print(sorted(list(sequence_list[:(i+1)]))) + print(i, j) + combos.append([i,j]) + interesting_combos.append(sorted(list(possible_combos[i][j]))) + +print(combos) +#for k in range(0,len(combos)): + +print(interesting_combos) + + +############################################################################################################################################### +################### Getting ROC curves from json files ######################################################################################## +############################################################################################################################################### +def convert(x): + if hasattr(x, "tolist"): + return x.tolist() + raise TypeError(x) + + +kind = 'val' +#kind = 'test' +#kind = 'train' + +def pretty_ROC_Curve(tr_set, kind, type, var): + + with open(tr_set) as user_file: + file_contents = user_file.read() + + results = json.loads(file_contents) + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + + c_fill = 'rgba(52, 152, 219, 0.2)' + c_line = 'rgba(52, 152, 219, 0.5)' + c_line_main = 'rgba(41, 128, 185, 1.0)' + c_grid = 'rgba(189, 195, 199, 0.5)' + c_annot = 'rgba(149, 165, 166, 0.5)' + c_highlight = 'rgba(192, 57, 43, 1.0)' + + fpr_mean = np.linspace(0, 1, 100) + + interp_tprs = [] + for i in range(100): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) + tpr_mean = np.mean(interp_tprs, axis = 0) + tpr_mean[-1] = 1.0 + tpr_std = 2*np.std(interp_tprs, axis = 0) + tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) + tpr_lower = tpr_mean - tpr_std + auc = np.mean(results[kind]['auc']) + + range_plot_x = [0,1] + range_plot_y = [0.2,1] + + import plotly.graph_objects as go + + + fig = go.Figure([go.Scatter(x = tpr_upper, y = 1 - fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = 1 - fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = 
False, name = 'lower'), + go.Scatter(x = tpr_mean, y = 1 - fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.5f}')]) + + fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) + fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) + fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') + fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + + fig.write_image(f"plot/{folder_save}/Diff_ROCs/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}_{bgs[bg_choice]}.jpg") + fig.write_image(f"plot/{folder_save}/Diff_ROCs/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}_{bgs[bg_choice]}.pdf") + +def pretty_ROC_Curve_var(results, kind, type, var): + + results = results + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + + c_fill = 'rgba(52, 152, 219, 0.2)' + c_line = 'rgba(52, 152, 219, 0.5)' + c_line_main = 'rgba(41, 128, 185, 1.0)' + c_grid = 'rgba(189, 195, 199, 0.5)' + c_annot = 'rgba(149, 165, 166, 0.5)' + c_highlight = 'rgba(192, 57, 43, 1.0)' + + fpr_mean = np.linspace(0, 1, 100) + + interp_tprs = [] + for i in range(1): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) + tpr_mean = np.mean(interp_tprs, axis = 0) + tpr_mean[-1] = 1.0 + tpr_std = 2*np.std(interp_tprs, axis = 0) + tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) + tpr_lower = tpr_mean - tpr_std + auc = np.mean(results[kind]['auc']) + + range_plot_x = [0,1] + range_plot_y = [0.2,1] + + import plotly.graph_objects as go + + + fig = go.Figure([go.Scatter(x = tpr_upper, y = 1 - fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = 1 - fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = tpr_mean, y = 1 - fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.5f}')]) + + fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) + fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) + fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') + fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + + fig.write_image(f"plot/{folder_save}/Diff_ROCs/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}_new_{bgs[bg_choice]}.jpg") + 
fig.write_image(f"plot/{folder_save}/Diff_ROCs/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}_new_{bgs[bg_choice]}.pdf") + +def pretty_ROC_Curve_var_test_train_val(results, type, var): + + results = results + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + + c_fill = 'rgba(52, 152, 219, 0.2)' + c_line = 'rgba(52, 152, 219, 0.5)' + c_line_train = 'rgba(41, 128, 185, 1.0)' + c_line_test = 'rgba(58, 217, 19, 0.8)' + c_line_val = 'rgba(244, 70, 10, 0.8)' + c_grid = 'rgba(189, 195, 199, 0.5)' + c_annot = 'rgba(149, 165, 166, 0.5)' + c_highlight = 'rgba(192, 57, 43, 1.0)' + + fpr_mean = np.linspace(0, 1, 100) + + interp_tprs = [] + + range_plot_x = [0,1] + range_plot_y = [0.2,1] + + import plotly.graph_objects as go + colours = {'test':c_line_test, 'train': c_line_train, 'val': c_line_val} + fig_test = 0 + fig_train = 0 + fig_val = 0 + figs = {'test': fig_test, 'train': fig_train, 'val': fig_val} + for kind in ['test', 'val', 'train']: + for i in range(1): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) + tpr_mean = np.mean(interp_tprs, axis = 0) + tpr_mean[-1] = 1.0 + tpr_std = 2*np.std(interp_tprs, axis = 0) + tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) + tpr_lower = tpr_mean - tpr_std + auc = np.mean(results[kind]['auc']) + colour = colours[kind] + + + figs[kind] = go.Scatter(x = tpr_mean, y = 1 - fpr_mean, line = dict(color = colour, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.5f}, {kind}') + fig = go.Figure(data = [figs['test'], figs['train'], figs['val']]) + + fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) + fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) + fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') + fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + + if not os.path.exists(f"plot/{folder_save}/ROC"): + os.mkdir(f"plot/{folder_save}/ROC") + + fig.write_image(f"plot/{folder_save}/ROC/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_all_{var}_new_{bgs[bg_choice]}.jpg") + fig.write_image(f"plot/{folder_save}/ROC/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_all_{var}_new_{bgs[bg_choice]}.pdf") +############################################################################################################################### + +for versions in interesting_combos: + versions_true = [int(version) for version in versions] + versions = [True if value in versions_true else False for value in range(0, len(names_sig))] + print(versions) + print(np.array(names_sig)[versions]) + var = np.array(names_sig)[versions] + var = [f"{va}_{roi}" for va in var] + + time = arrow.now().format("YY_MM_DD") + plt.style.use(hep.style.ROOT) + + + df = df.sample(frac = 1).reset_index(drop=True) + + X = df[list(var)] + print(X) + print(X.info()) + + X_signal = df[var][df.target == 1] + X_bg = df[var][df.target == 0] + + y = df["target"] + print(y) + + y_signal = 
df["target"][df.target == 1] + y_bg = df["target"][df.target == 0] + + from sklearn.impute import SimpleImputer + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import OneHotEncoder + + categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + + from sklearn.preprocessing import StandardScaler + numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + + cat_cols = X.select_dtypes(exclude = "number").columns + num_cols = X.select_dtypes(include = "number").columns + + print(cat_cols) + print(num_cols) + + from sklearn.compose import ColumnTransformer + + full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + + + + X_processed = full_processor.fit_transform(X) + y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + + y_processed_sig = SimpleImputer(strategy = "most_frequent").fit_transform(y_signal.values.reshape(-1,1)) + y_processed_bg = SimpleImputer(strategy = "most_frequent").fit_transform(y_bg.values.reshape(-1,1)) + + from sklearn.model_selection import train_test_split + + X_train, X_test, y_train, y_test = train_test_split(X, y_processed, stratify = y_processed, random_state = 1121218) + X_train_sig, X_test_sig, y_train_sig, y_test_sig = train_test_split(X_signal, y_processed_sig, stratify = y_processed_sig, random_state = 1121218) + X_train_bg, X_test_bg, y_train_bg, y_test_bg = train_test_split(X_bg, y_processed_bg, stratify = y_processed_bg, random_state = 1121218) + #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 101, stratify = y) + print(X_train) + print(X_test) + print(y_train) + + + + + + pretty_ROC_Curve(f"plot/{folder_save}/results_lr_{learning_rate}_bg_{bgs[bg_choice]}.json", kind, "full", versions_true) +############################################################################################################################################################## +##################### Zero train ROC ######################################################################################################################### +############################################################################################################################################################## + + pretty_ROC_Curve(f"plot/{folder_save}/results_zero_train_lr_{learning_rate}_bg_{bgs[bg_choice]}.json", kind, 'zero', versions_true) + +############################################################################################################################################################## +##################### Weak train ROC ######################################################################################################################### +############################################################################################################################################################## + + pretty_ROC_Curve(f"plot/{folder_save}/results_weak_train_lr_{learning_rate}_bg_{bgs[bg_choice]}.json", kind, 'weak', versions_true) + +############################################################################################################################################################## + + + trials = Trials() + 
+############################################################################################################################################################## +##################### Initiate the final training to be presented with the best parameters ################################################################### +############################################################################################################################################################## + + from sklearn.metrics import accuracy_score + +### Init classifier + xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = 0.0292, gamma = 1.087, reg_alpha = 42.0, reg_lambda = 0.381, n_estimators = 200, max_depth = 8, subsample = 0.841, min_child_weight = 2.0, colsample_bytree = 0.994, scale_pos_weight = 10) + +### Fit + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + dtest = xgb.DMatrix(X_test, label = y_test) + dtest_signal = xgb.DMatrix(X_test_sig, label = y_test_sig) + dtest_bg = xgb.DMatrix(X_test_bg, label = y_test_bg) +#print(dtest) + dtrain = xgb.DMatrix(X_train[:int(len(X_train)*0.8)], label = y_train[:int(len(y_train)*0.8)]) + dval = xgb.DMatrix(X_train[int(len(X_train)*0.8):], label = y_train[int(len(y_train)*0.8):]) + model_xgb = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 200) #num_boost_round = 1000, + model_xgb_weak = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 20) #num_boost_round = 1000, + model_xgb_zero = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 2) #num_boost_round = 1000, + sets = [dtrain, dval, dtest] + results_new = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + results_new_weak = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + results_new_zero = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + params_new = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss'} + + for i, ds in enumerate(results_new.keys()): + print(i) + y_preds_new = model_xgb.predict(sets[i]) + y_preds_new_weak = model_xgb_weak.predict(sets[i]) + y_preds_new_zero = model_xgb_zero.predict(sets[i]) + labels_new = sets[i].get_label() + fpr_new, tpr_new, thresholds_new = roc_curve(labels_new, y_preds_new) + fpr_new_weak, tpr_new_weak, thresholds_new_weak = roc_curve(labels_new, y_preds_new_weak) + fpr_new_zero, tpr_new_zero, thresholds_new_zero = roc_curve(labels_new, y_preds_new_zero) + results_new[ds]['fpr'].append(fpr_new) + results_new[ds]['tpr'].append(tpr_new) + results_new[ds]['thresholds'].append(thresholds_new) + results_new[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new)) + results_new_weak[ds]['fpr'].append(fpr_new_weak) + results_new_weak[ds]['tpr'].append(tpr_new_weak) + results_new_weak[ds]['thresholds'].append(thresholds_new_weak) + results_new_weak[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new_weak)) + results_new_zero[ds]['fpr'].append(fpr_new_zero) + results_new_zero[ds]['tpr'].append(tpr_new_zero) + results_new_zero[ds]['thresholds'].append(thresholds_new_zero) + 
results_new_zero[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new_zero)) + + + + pretty_ROC_Curve_var(results_new, 'test', 'full', versions_true) + + pretty_ROC_Curve_var_test_train_val(results_new, 'full', versions_true) + + xgb_cl.fit(X_train, y_train) + + print(xgb_cl) + +################################################################################################################################### +################################## Predict and give the final accuracy scores and importance plots ################################ +################################################################################################################################### + preds = xgb_cl.predict(X_test) + + print(accuracy_score(y_test, preds)) + + print(y_test) + print(model_xgb.predict(dtest)) + print(np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)])) + predict_train = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)]) + predict_train_weak = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb_weak.predict(dtest)]) + predict_train_zero = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb_zero.predict(dtest)]) + + print(accuracy_score(y_test, predict_train)) + + from xgboost import plot_importance + from xgboost import plot_tree, to_graphviz + + #importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_}) + #importances = importances.sort_values(by = "Importance", ascending = False) + #importances = importances.set_index('Feature') + #print(importances) + #importances.plot.bar() + ''' +fig, ax = plt.subplots(figsize=(17,12)) +plot_importance(xgb_cl, fmap = 'feature_map_var.txt', ax = ax) +plt.xlabel('Feature scores') +plt.ylabel("Feature names") +plt.title('Importance plot') +plt.legend(['']) +#plt.show() +plt.savefig(f"plot/{folder_save}/importance_{var}.jpg") + + +feature_importance = model_xgb.get_score(importance_type = 'weight') +keys = list(feature_importance.keys()) +names_sig = ['m(H)', '$p_t$(H)', '$p_t$(Z)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$', + '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet', + '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$', + '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$'] +names_sig = ['$\Delta\Phi (l_{subleading}, jet_{subleading})$'] +values = list(feature_importance.values()) +data = pd.DataFrame(data = values, index = names_sig, columns = ['score']).sort_values(by = 'score', ascending = False) +print(data) +print(data.index) + + +fig = plt.figure(figsize=(17,12)) +ax1 = fig.add_subplot(1,2,1) +ax1.set_axis_off() +ax2 = fig.add_subplot(1,2,2) +ax2.barh(list(reversed(data.index)), list(reversed(data.score))) +ax2.set_xlabel('Feature scores') +ax2.set_ylabel("Feature names") +ax2.set_title('Importance plot') +#plt.show() +plt.savefig(f"plot/{folder_save}/importance_train_lr_{learning_rate}_{var}.jpg") + + +plt.figure(figsize=(17,12)) +plot_tree(xgb_cl, fmap = 'feature_map_var.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"plot/{folder_save}/boost_tree_{var}.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show() + +plt.figure(figsize=(17,12)) +plot_tree(model_xgb, fmap = 'feature_map_var.txt') +plt.title('Decision tree 
graph')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/boost_tree_train_lr_{learning_rate}_{var}.jpg", dpi = 1800)
+###result = 1/(1+np.exp(-leaf_value)) for belonging to class 1
+#plt.show()
+    '''
+    plt.figure(figsize=(17,12))
+    plt.hist(np.array(model_xgb.predict(dtest)), bins = 40, edgecolor = 'blue',fill = False)
+    plt.hist(np.array(predict_train), bins = 40, edgecolor = 'green', hatch = '/', fill = False)
+    plt.hist(y_test, bins = 40, facecolor = 'orange', edgecolor = 'orange', fill = False)
+    plt.title('Classifier output')
+    plt.legend(['Model output on test set', 'Thresholded output', 'True test labels'])
+    #plt.show()
+    plt.savefig(f"plot/{folder_save}/class_output_train_lr_{learning_rate}_{versions_true}_{bgs[bg_choice]}.jpg")
+
+    plt.figure(figsize=(17,12))
+    plt.hist(np.array(model_xgb.predict(dtest_signal)), bins = 40, edgecolor = 'blue',fill = False)
+    plt.hist(np.array(model_xgb.predict(dtest_bg)), bins = 40, edgecolor = 'red', fill = False)
+    plt.title('Classifier output')
+    plt.legend(['Signal', 'Background'])
+    #plt.show()
+    plt.savefig(f"plot/{folder_save}/class_output_train_lr_{learning_rate}_{versions_true}_sig_vs_bg_{bgs[bg_choice]}.jpg")
+
+    plt.figure(figsize=(17,12))
+    plt.hist(np.array(model_xgb_weak.predict(dtest)), bins = 40, edgecolor = 'blue',fill = False)
+    plt.hist(np.array(predict_train_weak), bins = 40, edgecolor = 'green', hatch = '/', fill = False)
+    plt.hist(y_test, bins = 40, facecolor = 'orange', edgecolor = 'orange', fill = False)
+    plt.title('Classifier output')
+    plt.legend(['Model output on test set', 'Thresholded output', 'True test labels'])
+    #plt.show()
+    plt.savefig(f"plot/{folder_save}/class_output_train_lr_{learning_rate}_{versions_true}_weak_{bgs[bg_choice]}.jpg")
+
+    plt.figure(figsize=(17,12))
+    plt.hist(np.array(model_xgb_zero.predict(dtest)), bins = 40, edgecolor = 'blue',fill = False)
+    plt.hist(np.array(predict_train_zero), bins = 40, edgecolor = 'green', hatch = '/', fill = False)
+    plt.hist(y_test, bins = 40, facecolor = 'orange', edgecolor = 'orange', fill = False)
+    plt.title('Classifier output')
+    plt.legend(['Model output on test set', 'Thresholded output', 'True test labels'])
+    #plt.show()
+    plt.savefig(f"plot/{folder_save}/class_output_train_lr_{learning_rate}_{versions_true}_zero_{bgs[bg_choice]}.jpg")
+
+###result = 1/(1+np.exp(-leaf_value)) for belonging to class 1
+#plt.show()
+
+with open(f"plot/{folder_save}/ROC.txt", "a") as myfile:
+    myfile.write(f"Accuracy score for {var}: " + str(accuracy_score(y_test, predict_train)) + " " + '\n')
+
+'''
+plt.figure(figsize=(17,12))
+to_graphviz(model_xgb, fmap = 'feature_map.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/boost_tree_train_graphviz.jpg", dpi = 1800)
+###result = 1/(1+np.exp(-leaf_value)) for belonging to class 1
+#plt.show()'''
diff --git a/xgb_test_only_xgb_reloaded_no_coffea_vars_bg_multibg.py b/xgb_test_only_xgb_reloaded_no_coffea_vars_bg_multibg.py
new file mode 100644
index 0000000..c6d21b4
--- /dev/null
+++ b/xgb_test_only_xgb_reloaded_no_coffea_vars_bg_multibg.py
@@ -0,0 +1,525 @@
+from coffea.util import load
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt, mplhep as hep
+import hist
+import argparse, sys, os, arrow, glob, yaml
+from matplotlib.offsetbox import AnchoredText
+import xgboost as xgb
+from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
+from sklearn.metrics import accuracy_score
+from tqdm.notebook import tqdm
+from sklearn.metrics import roc_auc_score, roc_curve
+from sklearn.model_selection import 
RepeatedKFold +import json + +net_path = "/net/scratch_cms3a/vaulin/" +folder_save = 'eval_23_08_08' +if not os.path.exists(f"./plot/{folder_save}"): + os.mkdir(f"./plot/{folder_save}") +if not os.path.exists(f"./plot/{folder_save}/Diff_ROCs"): + os.mkdir(f"./plot/{folder_save}/Diff_ROCs") +if not os.path.exists(net_path + f"plot/{folder_save}"): + os.mkdir(net_path + f"plot/{folder_save}") +df = pd.read_csv('xgb_training_dataset_low_mumu.csv') + +roi = 'low_mumu' +learning_rate = 0.03 + +bgs = ['DY', "ZZ", "WZ", "tt", "ZHtobb"] +bg_choice = 1 +bg_choice_2 = 0 + +from itertools import combinations + +names_sig = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +names_ind_array = np.arange(len(names_sig)) + +possible_combos = [list(combinations(names_ind_array, i)) for i in range(1,len(names_sig))] +#print(possible_combos) +print(possible_combos[1]) +#print(possible_combos[-1]) + + +length = [len(possible_el) for possible_el in possible_combos] +print(length) + +import random +sequence_list = np.arange(0,len(names_sig)) +#print(sequence_list) +random.shuffle(sequence_list) +print(sequence_list) + +interesting_combos = [] +combos = [] + +for i in range(0, len(length)): + #print([len(elem) for elem in possible_combos[i]]) + for j in range(0, len(possible_combos[i])): + #print(list(possible_combos[i][j])) + #print(list(sequence_list[:i])) + if sorted(list(possible_combos[i][j])) == sorted(list(sequence_list[:(i+1)])): + print(sorted(list(possible_combos[i][j]))) + print(sorted(list(sequence_list[:(i+1)]))) + print(i, j) + combos.append([i,j]) + interesting_combos.append(sorted(list(possible_combos[i][j]))) + +print(combos) +#for k in range(0,len(combos)): + +print(interesting_combos) + + +############################################################################################################################################### +################### Getting ROC curves from json files ######################################################################################## +############################################################################################################################################### +def convert(x): + if hasattr(x, "tolist"): + return x.tolist() + raise TypeError(x) + + +kind = 'val' +#kind = 'test' +#kind = 'train' + +def pretty_ROC_Curve(tr_set, kind, type, var): + + with open(tr_set) as user_file: + file_contents = user_file.read() + + results = json.loads(file_contents) + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + + c_fill = 'rgba(52, 152, 219, 0.2)' + c_line = 'rgba(52, 152, 219, 0.5)' + c_line_main = 'rgba(41, 128, 185, 1.0)' + c_grid = 'rgba(189, 195, 199, 0.5)' + c_annot = 'rgba(149, 165, 166, 0.5)' + c_highlight = 'rgba(192, 57, 43, 1.0)' + + fpr_mean = np.linspace(0, 1, 100) + + interp_tprs = [] + for i in range(100): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) + tpr_mean = np.mean(interp_tprs, axis = 0) + tpr_mean[-1] = 1.0 + tpr_std = 2*np.std(interp_tprs, axis = 0) + tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) + tpr_lower = tpr_mean - tpr_std + auc = np.mean(results[kind]['auc']) + + range_plot_x = [0,1] + range_plot_y = 
[0.2,1] + + import plotly.graph_objects as go + + + fig = go.Figure([go.Scatter(x = tpr_upper, y = 1 - fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = 1 - fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = tpr_mean, y = 1 - fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.5f}')]) + + fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) + fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) + fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') + fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + + fig.write_image(f"plot/{folder_save}/Diff_ROCs/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}_{bgs[bg_choice]}_{bgs[bg_choice_2]}.jpg") + fig.write_image(f"plot/{folder_save}/Diff_ROCs/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}_{bgs[bg_choice]}_{bgs[bg_choice_2]}.pdf") + +def pretty_ROC_Curve_var(results, kind, type, var): + + results = results + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + + c_fill = 'rgba(52, 152, 219, 0.2)' + c_line = 'rgba(52, 152, 219, 0.5)' + c_line_main = 'rgba(41, 128, 185, 1.0)' + c_grid = 'rgba(189, 195, 199, 0.5)' + c_annot = 'rgba(149, 165, 166, 0.5)' + c_highlight = 'rgba(192, 57, 43, 1.0)' + + fpr_mean = np.linspace(0, 1, 100) + + interp_tprs = [] + for i in range(1): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) + tpr_mean = np.mean(interp_tprs, axis = 0) + tpr_mean[-1] = 1.0 + tpr_std = 2*np.std(interp_tprs, axis = 0) + tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) + tpr_lower = tpr_mean - tpr_std + auc = np.mean(results[kind]['auc']) + + range_plot_x = [0,1] + range_plot_y = [0.2,1] + + import plotly.graph_objects as go + + + fig = go.Figure([go.Scatter(x = tpr_upper, y = 1 - fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = 1 - fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = tpr_mean, y = 1 - fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.5f}')]) + + fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) + fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) + fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') + fig.update_xaxes(range = range_plot_x, 
gridcolor = c_grid, constrain = 'domain', linecolor = 'black')
+
+ fig.write_image(f"plot/{folder_save}/Diff_ROCs/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}_new_{bgs[bg_choice]}_{bgs[bg_choice_2]}.jpg")
+ fig.write_image(f"plot/{folder_save}/Diff_ROCs/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}_new_{bgs[bg_choice]}_{bgs[bg_choice_2]}.pdf")
+
+def pretty_ROC_Curve_var_test_train_val(results, type, var):
+
+ params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate}
+ metrics = ['auc', 'fpr', 'tpr', 'thresholds']
+
+ c_fill = 'rgba(52, 152, 219, 0.2)'
+ c_line = 'rgba(52, 152, 219, 0.5)'
+ c_line_train = 'rgba(41, 128, 185, 1.0)'
+ c_line_test = 'rgba(58, 217, 19, 0.8)'
+ c_line_val = 'rgba(244, 70, 10, 0.8)'
+ c_grid = 'rgba(189, 195, 199, 0.5)'
+ c_annot = 'rgba(149, 165, 166, 0.5)'
+ c_highlight = 'rgba(192, 57, 43, 1.0)'
+
+ fpr_mean = np.linspace(0, 1, 100)
+
+ range_plot_x = [0,1]
+ range_plot_y = [0.2,1]
+
+ import plotly.graph_objects as go
+ colours = {'test':c_line_test, 'train': c_line_train, 'val': c_line_val}
+ figs = {'test': None, 'train': None, 'val': None}
+ for kind in ['test', 'val', 'train']:
+ # reset per kind, otherwise the curves of the previous kind leak into this mean/band
+ interp_tprs = []
+ for i in range(1):
+ fpr = results[kind]['fpr'][i]
+ tpr = results[kind]['tpr'][i]
+ interp_tpr = np.interp(fpr_mean, fpr, tpr)
+ interp_tpr[0] = 0.0
+ interp_tprs.append(interp_tpr)
+ tpr_mean = np.mean(interp_tprs, axis = 0)
+ tpr_mean[-1] = 1.0
+ tpr_std = 2*np.std(interp_tprs, axis = 0)
+ tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1)
+ tpr_lower = tpr_mean - tpr_std
+ auc = np.mean(results[kind]['auc'])
+ colour = colours[kind]
+
+
+ figs[kind] = go.Scatter(x = tpr_mean, y = 1 - fpr_mean, line = dict(color = colour, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.5f}, {kind}')
+ fig = go.Figure(data = [figs['test'], figs['train'], figs['val']])
+
+ fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0)
+ fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,))
+ fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black')
+ fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black')
+
+ if not os.path.exists(f"plot/{folder_save}/ROC"):
+ os.mkdir(f"plot/{folder_save}/ROC")
+
+ fig.write_image(f"plot/{folder_save}/ROC/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_all_{var}_new_{bgs[bg_choice]}_{bgs[bg_choice_2]}.jpg")
+ fig.write_image(f"plot/{folder_save}/ROC/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_all_{var}_new_{bgs[bg_choice]}_{bgs[bg_choice_2]}.pdf")
+###############################################################################################################################
+
+for versions in interesting_combos:
+ versions_true = [int(version) for version in versions]
+ versions = [True if value in versions_true else False for value in range(0, len(names_sig))]
+ print(versions)
+ print(np.array(names_sig)[versions])
+ var = np.array(names_sig)[versions]
+
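 # the training CSV appears to name its columns <feature>_<region>
+ # (e.g. Higgs_mass_low_mumu), so the masked feature names get the region
+ # suffix attached before the dataframe lookup below
+ 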
var = [f"{va}_{roi}" for va in var] + + time = arrow.now().format("YY_MM_DD") + plt.style.use(hep.style.ROOT) + + + df = df.sample(frac = 1).reset_index(drop=True) + + X = df[list(var)] + print(X) + print(X.info()) + + X_signal = df[var][df.target == 1] + X_bg = df[var][df.target == 0] + + y = df["target"] + print(y) + + y_signal = df["target"][df.target == 1] + y_bg = df["target"][df.target == 0] + + from sklearn.impute import SimpleImputer + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import OneHotEncoder + + categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + + from sklearn.preprocessing import StandardScaler + numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + + cat_cols = X.select_dtypes(exclude = "number").columns + num_cols = X.select_dtypes(include = "number").columns + + print(cat_cols) + print(num_cols) + + from sklearn.compose import ColumnTransformer + + full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + + + + X_processed = full_processor.fit_transform(X) + y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + + y_processed_sig = SimpleImputer(strategy = "most_frequent").fit_transform(y_signal.values.reshape(-1,1)) + y_processed_bg = SimpleImputer(strategy = "most_frequent").fit_transform(y_bg.values.reshape(-1,1)) + + from sklearn.model_selection import train_test_split + + X_train, X_test, y_train, y_test = train_test_split(X, y_processed, stratify = y_processed, random_state = 1121218) + X_train_sig, X_test_sig, y_train_sig, y_test_sig = train_test_split(X_signal, y_processed_sig, stratify = y_processed_sig, random_state = 1121218) + X_train_bg, X_test_bg, y_train_bg, y_test_bg = train_test_split(X_bg, y_processed_bg, stratify = y_processed_bg, random_state = 1121218) + #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 101, stratify = y) + print(X_train) + print(X_test) + print(y_train) + + + + + + pretty_ROC_Curve(f"plot/{folder_save}/results_lr_{learning_rate}_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}.json", kind, "full", versions_true) +############################################################################################################################################################## +##################### Zero train ROC ######################################################################################################################### +############################################################################################################################################################## + + pretty_ROC_Curve(f"plot/{folder_save}/results_zero_train_lr_{learning_rate}_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}.json", kind, 'zero', versions_true) + +############################################################################################################################################################## +##################### Weak train ROC ######################################################################################################################### +############################################################################################################################################################## + + 
pretty_ROC_Curve(f"plot/{folder_save}/results_weak_train_lr_{learning_rate}_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}.json", kind, 'weak', versions_true) + +############################################################################################################################################################## + + + trials = Trials() + +############################################################################################################################################################## +##################### Initiate the final training to be presented with the best parameters ################################################################### +############################################################################################################################################################## + + from sklearn.metrics import accuracy_score + +### Init classifier + xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = 0.0292, gamma = 1.087, reg_alpha = 42.0, reg_lambda = 0.381, n_estimators = 200, max_depth = 8, subsample = 0.841, min_child_weight = 2.0, colsample_bytree = 0.994, scale_pos_weight = 10) + +### Fit + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + dtest = xgb.DMatrix(X_test, label = y_test) + dtest_signal = xgb.DMatrix(X_test_sig, label = y_test_sig) + dtest_bg = xgb.DMatrix(X_test_bg, label = y_test_bg) +#print(dtest) + dtrain = xgb.DMatrix(X_train[:int(len(X_train)*0.8)], label = y_train[:int(len(y_train)*0.8)]) + dval = xgb.DMatrix(X_train[int(len(X_train)*0.8):], label = y_train[int(len(y_train)*0.8):]) + model_xgb = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 200) #num_boost_round = 1000, + model_xgb_weak = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 20) #num_boost_round = 1000, + model_xgb_zero = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 2) #num_boost_round = 1000, + sets = [dtrain, dval, dtest] + results_new = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + results_new_weak = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + results_new_zero = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + params_new = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss'} + + for i, ds in enumerate(results_new.keys()): + print(i) + y_preds_new = model_xgb.predict(sets[i]) + y_preds_new_weak = model_xgb_weak.predict(sets[i]) + y_preds_new_zero = model_xgb_zero.predict(sets[i]) + labels_new = sets[i].get_label() + fpr_new, tpr_new, thresholds_new = roc_curve(labels_new, y_preds_new) + fpr_new_weak, tpr_new_weak, thresholds_new_weak = roc_curve(labels_new, y_preds_new_weak) + fpr_new_zero, tpr_new_zero, thresholds_new_zero = roc_curve(labels_new, y_preds_new_zero) + results_new[ds]['fpr'].append(fpr_new) + results_new[ds]['tpr'].append(tpr_new) + results_new[ds]['thresholds'].append(thresholds_new) + results_new[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new)) + results_new_weak[ds]['fpr'].append(fpr_new_weak) + results_new_weak[ds]['tpr'].append(tpr_new_weak) + 
results_new_weak[ds]['thresholds'].append(thresholds_new_weak) + results_new_weak[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new_weak)) + results_new_zero[ds]['fpr'].append(fpr_new_zero) + results_new_zero[ds]['tpr'].append(tpr_new_zero) + results_new_zero[ds]['thresholds'].append(thresholds_new_zero) + results_new_zero[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new_zero)) + + + + pretty_ROC_Curve_var(results_new, 'test', 'full', versions_true) + + pretty_ROC_Curve_var_test_train_val(results_new, 'full', versions_true) + + xgb_cl.fit(X_train, y_train) + + print(xgb_cl) + +################################################################################################################################### +################################## Predict and give the final accuracy scores and importance plots ################################ +################################################################################################################################### + preds = xgb_cl.predict(X_test) + + print(accuracy_score(y_test, preds)) + + print(y_test) + print(model_xgb.predict(dtest)) + print(np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)])) + predict_train = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)]) + predict_train_weak = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb_weak.predict(dtest)]) + predict_train_zero = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb_zero.predict(dtest)]) + + print(accuracy_score(y_test, predict_train)) + + from xgboost import plot_importance + from xgboost import plot_tree, to_graphviz + + #importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_}) + #importances = importances.sort_values(by = "Importance", ascending = False) + #importances = importances.set_index('Feature') + #print(importances) + #importances.plot.bar() + ''' +fig, ax = plt.subplots(figsize=(17,12)) +plot_importance(xgb_cl, fmap = 'feature_map_var.txt', ax = ax) +plt.xlabel('Feature scores') +plt.ylabel("Feature names") +plt.title('Importance plot') +plt.legend(['']) +#plt.show() +plt.savefig(f"plot/{folder_save}/importance_{var}.jpg") + + +feature_importance = model_xgb.get_score(importance_type = 'weight') +keys = list(feature_importance.keys()) +names_sig = ['m(H)', '$p_t$(H)', '$p_t$(Z)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$', + '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet', + '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$', + '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$'] +names_sig = ['$\Delta\Phi (l_{subleading}, jet_{subleading})$'] +values = list(feature_importance.values()) +data = pd.DataFrame(data = values, index = names_sig, columns = ['score']).sort_values(by = 'score', ascending = False) +print(data) +print(data.index) + + +fig = plt.figure(figsize=(17,12)) +ax1 = fig.add_subplot(1,2,1) +ax1.set_axis_off() +ax2 = fig.add_subplot(1,2,2) +ax2.barh(list(reversed(data.index)), list(reversed(data.score))) +ax2.set_xlabel('Feature scores') +ax2.set_ylabel("Feature names") +ax2.set_title('Importance plot') +#plt.show() +plt.savefig(f"plot/{folder_save}/importance_train_lr_{learning_rate}_{var}.jpg") + + +plt.figure(figsize=(17,12)) +plot_tree(xgb_cl, fmap = 
'feature_map_var.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/boost_tree_{var}.jpg", dpi = 1800)
+###result = 1/(1+np.exp(-leaf_value)) for belonging to class 1
+#plt.show()
+
+plt.figure(figsize=(17,12))
+plot_tree(model_xgb, fmap = 'feature_map_var.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/boost_tree_train_lr_{learning_rate}_{var}.jpg", dpi = 1800)
+###result = 1/(1+np.exp(-leaf_value)) for belonging to class 1
+#plt.show()
+ '''
+ plt.figure(figsize=(17,12))
+ plt.hist(np.array(model_xgb.predict(dtest)), bins = 40, edgecolor = 'blue',fill = False)
+ plt.hist(np.array(predict_train), bins = 40, edgecolor = 'green', hatch = '/', fill = False)
+ plt.hist(y_test, bins = 40, facecolor = 'orange', edgecolor = 'orange', fill = False)
+ plt.title('Classifier output')
+ plt.legend(['Train output', 'Train output after threshold','Test data'])
+ #plt.show()
+ plt.savefig(f"plot/{folder_save}/class_output_train_lr_{learning_rate}_{versions_true}_{bgs[bg_choice]}_{bgs[bg_choice_2]}.jpg")
+
+ plt.figure(figsize=(17,12))
+ plt.hist(np.array(model_xgb.predict(dtest_signal)), bins = 40, edgecolor = 'blue',fill = False)
+ plt.hist(np.array(model_xgb.predict(dtest_bg)), bins = 40, edgecolor = 'red', fill = False)
+ plt.title('Classifier output')
+ plt.legend(['Signal', 'Background'])
+ #plt.show()
+ plt.savefig(f"plot/{folder_save}/class_output_train_lr_{learning_rate}_{versions_true}_sig_vs_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}.jpg")
+
+ plt.figure(figsize=(17,12))
+ plt.hist(np.array(model_xgb_weak.predict(dtest)), bins = 40, edgecolor = 'blue',fill = False)
+ plt.hist(np.array(predict_train_weak), bins = 40, edgecolor = 'green', hatch = '/', fill = False)
+ plt.hist(y_test, bins = 40, facecolor = 'orange', edgecolor = 'orange', fill = False)
+ plt.title('Classifier output')
+ plt.legend(['Train output', 'Train output after threshold','Test data'])
+ #plt.show()
+ plt.savefig(f"plot/{folder_save}/class_output_train_lr_{learning_rate}_{versions_true}_weak_{bgs[bg_choice]}_{bgs[bg_choice_2]}.jpg")
+
+ plt.figure(figsize=(17,12))
+ plt.hist(np.array(model_xgb_zero.predict(dtest)), bins = 40, edgecolor = 'blue',fill = False)
+ plt.hist(np.array(predict_train_zero), bins = 40, edgecolor = 'green', hatch = '/', fill = False)
+ plt.hist(y_test, bins = 40, facecolor = 'orange', edgecolor = 'orange', fill = False)
+ plt.title('Classifier output')
+ plt.legend(['Train output', 'Train output after threshold','Test data'])
+ #plt.show()
+ plt.savefig(f"plot/{folder_save}/class_output_train_lr_{learning_rate}_{versions_true}_zero_{bgs[bg_choice]}_{bgs[bg_choice_2]}.jpg")
+
+###result = 1/(1+np.exp(-leaf_value)) for belonging to class 1
+#plt.show()'''
+
+with open(f"plot/{folder_save}/ROC.txt", "a") as myfile:
+ myfile.write(f"Accuracy for {var}: " + str(accuracy_score(y_test, predict_train)) + " " + '\n')
+
+'''
+plt.figure(figsize=(17,12))
+to_graphviz(model_xgb, fmap = 'feature_map.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/boost_tree_train_graphviz.jpg", dpi = 1800)
+###result = 1/(1+np.exp(-leaf_value)) for belonging to class 1
+#plt.show()'''
diff --git a/xgb_test_only_xgb_reloaded_no_coffea_vars_bg_multibg_full_bg_set.py b/xgb_test_only_xgb_reloaded_no_coffea_vars_bg_multibg_full_bg_set.py
new file mode 100644
index 0000000..d13dc52
--- /dev/null
+++ b/xgb_test_only_xgb_reloaded_no_coffea_vars_bg_multibg_full_bg_set.py
@@ -0,0 +1,589 @@
+from coffea.util import load
+import 
numpy as np +import pandas as pd +import matplotlib.pyplot as plt, mplhep as hep +import hist +import argparse, sys, os, arrow, glob, yaml +from matplotlib.offsetbox import AnchoredText +import xgboost as xgb +from hyperopt import STATUS_OK, Trials, fmin, hp, tpe +from sklearn.metrics import accuracy_score +from tqdm.notebook import tqdm +from sklearn.metrics import roc_auc_score, roc_curve +from sklearn.model_selection import RepeatedKFold +import json + + +roi = 'low_ee' + +net_path = "/net/scratch_cms3a/vaulin/" +folder_save = 'eval_23_08_23_2' +if not os.path.exists(f"./plot/{folder_save}"): + os.mkdir(f"./plot/{folder_save}") +if not os.path.exists(f"./plot/{folder_save}/{roi}"): + os.mkdir(f"./plot/{folder_save}/{roi}") +if not os.path.exists(f"./plot/{folder_save}/Diff_ROCs"): + os.mkdir(f"./plot/{folder_save}/Diff_ROCs") +if not os.path.exists(f"./plot/{folder_save}/Diff_ROCs/{roi}"): + os.mkdir(f"./plot/{folder_save}/Diff_ROCs/{roi}") +if not os.path.exists(net_path + f"plot/{folder_save}"): + os.mkdir(net_path + f"plot/{folder_save}") + +df = pd.read_csv(f'./plot/{folder_save}/xgb_training_dataset_{roi}.csv') + + +learning_rate = 0.1 +eta = 0.1 + +bgs = ['DY', "ZZ", "WZ", "tt", "ZHtobb"] + +from itertools import combinations + +names_sig = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +names_ind_array = np.arange(len(names_sig)) + +possible_combos = [list(combinations(names_ind_array, i)) for i in range(1,len(names_sig))] +#print(possible_combos) +print(possible_combos[1]) +#print(possible_combos[-1]) + + +length = [len(possible_el) for possible_el in possible_combos] +print(length) + +import random +sequence_list = np.arange(0,len(names_sig)) +#print(sequence_list) +random.shuffle(sequence_list) +print(sequence_list) + +interesting_combos = [] +combos = [] + +for i in range(0, len(length)): + #print([len(elem) for elem in possible_combos[i]]) + for j in range(0, len(possible_combos[i])): + #print(list(possible_combos[i][j])) + #print(list(sequence_list[:i])) + if sorted(list(possible_combos[i][j])) == sorted(list(sequence_list[:(i+1)])): + print(sorted(list(possible_combos[i][j]))) + print(sorted(list(sequence_list[:(i+1)]))) + print(i, j) + combos.append([i,j]) + interesting_combos.append(sorted(list(possible_combos[i][j]))) + +print(combos) +#for k in range(0,len(combos)): + +print(interesting_combos) + +interesting_combos.append([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]) + +print(interesting_combos) + + +############################################################################################################################################### +################### Getting ROC curves from json files ######################################################################################## +############################################################################################################################################### +def convert(x): + if hasattr(x, "tolist"): + return x.tolist() + raise TypeError(x) + + +kind = 'val' +#kind = 'test' +#kind = 'train' + +def pretty_ROC_Curve(tr_set, kind, type, var): + + with open(tr_set) as user_file: + file_contents = user_file.read() + + results = json.loads(file_contents) + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 
'tpr', 'thresholds'] + + c_fill = 'rgba(52, 152, 219, 0.2)' + c_line = 'rgba(52, 152, 219, 0.5)' + c_line_main = 'rgba(41, 128, 185, 1.0)' + c_grid = 'rgba(189, 195, 199, 0.5)' + c_annot = 'rgba(149, 165, 166, 0.5)' + c_highlight = 'rgba(192, 57, 43, 1.0)' + + fpr_mean = np.linspace(0, 1, 100) + + interp_tprs = [] + for i in range(100): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) + tpr_mean = np.mean(interp_tprs, axis = 0) + tpr_mean[-1] = 1.0 + tpr_std = 2*np.std(interp_tprs, axis = 0) + tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) + tpr_lower = tpr_mean - tpr_std + auc = np.mean(results[kind]['auc']) + + range_plot_x = [0,1] + range_plot_y = [0.2,1] + + import plotly.graph_objects as go + + + fig = go.Figure([go.Scatter(x = tpr_upper, y = 1 - fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = 1 - fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = tpr_mean, y = 1 - fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.5f}')]) + + fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) + fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) + fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') + fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + + fig.write_image(f"plot/{folder_save}/Diff_ROCs/{roi}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}.jpg") + fig.write_image(f"plot/{folder_save}/Diff_ROCs/{roi}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}.pdf") + +def pretty_ROC_Curve_var(results, kind, type, var): + + results = results + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + + c_fill = 'rgba(52, 152, 219, 0.2)' + c_line = 'rgba(52, 152, 219, 0.5)' + c_line_main = 'rgba(41, 128, 185, 1.0)' + c_grid = 'rgba(189, 195, 199, 0.5)' + c_annot = 'rgba(149, 165, 166, 0.5)' + c_highlight = 'rgba(192, 57, 43, 1.0)' + + fpr_mean = np.linspace(0, 1, 100) + + interp_tprs = [] + for i in range(1): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) + tpr_mean = np.mean(interp_tprs, axis = 0) + tpr_mean[-1] = 1.0 + tpr_std = 2*np.std(interp_tprs, axis = 0) + tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) + tpr_lower = tpr_mean - tpr_std + auc = np.mean(results[kind]['auc']) + + range_plot_x = [0,1] + range_plot_y = [0,1] + + import plotly.graph_objects as go + + + fig = go.Figure([go.Scatter(x = tpr_upper, y = 1 - fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = 1 - fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo 
= 'skip', showlegend = False, name = 'lower'),
+ go.Scatter(x = tpr_mean, y = 1 - fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.5f}')])
+
+ fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0)
+ fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,))
+ fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black')
+ fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black')
+
+ fig.write_image(f"plot/{folder_save}/Diff_ROCs/{roi}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}_new.jpg")
+ fig.write_image(f"plot/{folder_save}/Diff_ROCs/{roi}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}_new.pdf")
+
+def pretty_ROC_Curve_var_test_train_val(results, type, var):
+
+ params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate}
+ metrics = ['auc', 'fpr', 'tpr', 'thresholds']
+
+ c_fill = 'rgba(52, 152, 219, 0.2)'
+ c_line = 'rgba(52, 152, 219, 0.5)'
+ c_line_train = 'rgba(41, 128, 185, 1.0)'
+ c_line_test = 'rgba(58, 217, 19, 0.8)'
+ c_line_val = 'rgba(244, 70, 10, 0.8)'
+ c_grid = 'rgba(189, 195, 199, 0.5)'
+ c_annot = 'rgba(149, 165, 166, 0.5)'
+ c_highlight = 'rgba(192, 57, 43, 1.0)'
+
+ fpr_mean = np.linspace(0, 1, 100)
+
+ range_plot_x = [0,1]
+ range_plot_y = [0,1]
+
+ import plotly.graph_objects as go
+ colours = {'test':c_line_test, 'train': c_line_train, 'val': c_line_val}
+ figs = {'test': None, 'train': None, 'val': None}
+ for kind in ['test', 'val', 'train']:
+ # reset per kind, otherwise the curves of the previous kind leak into this mean/band
+ interp_tprs = []
+ for i in range(1):
+ fpr = results[kind]['fpr'][i]
+ tpr = results[kind]['tpr'][i]
+ interp_tpr = np.interp(fpr_mean, fpr, tpr)
+ interp_tpr[0] = 0.0
+ interp_tprs.append(interp_tpr)
+ tpr_mean = np.mean(interp_tprs, axis = 0)
+ tpr_mean[-1] = 1.0
+ tpr_std = 2*np.std(interp_tprs, axis = 0)
+ tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1)
+ tpr_lower = tpr_mean - tpr_std
+ auc = np.mean(results[kind]['auc'])
+ colour = colours[kind]
+
+
+ figs[kind] = go.Scatter(x = tpr_mean, y = 1 - fpr_mean, line = dict(color = colour, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.5f}, {kind}')
+ fig = go.Figure(data = [figs['test'], figs['train'], figs['val']])
+
+ fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0)
+ fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,))
+ fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black')
+ fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black')
+
+ if not os.path.exists(f"plot/{folder_save}/ROC"):
+ os.mkdir(f"plot/{folder_save}/ROC")
+ if not os.path.exists(f"plot/{folder_save}/ROC/{roi}"):
+ os.mkdir(f"plot/{folder_save}/ROC/{roi}")
+
+
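 # plotly's static export below goes through an image backend; this assumes
+ # the kaleido package is available in the environment
+ 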
fig.write_image(f"plot/{folder_save}/ROC/{roi}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_all_{var}_new.jpg") + fig.write_image(f"plot/{folder_save}/ROC/{roi}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_all_{var}_new.pdf") +############################################################################################################################### + +for versions in interesting_combos: + versions_true = [int(version) for version in versions] + versions = [True if value in versions_true else False for value in range(0, len(names_sig))] + print(versions) + print(np.array(names_sig)[versions]) + var = np.array(names_sig)[versions] + var = [f"{va}_{roi}" for va in var] + + time = arrow.now().format("YY_MM_DD") + plt.style.use(hep.style.ROOT) + + + df = df.sample(frac = 1).reset_index(drop=True) + + X = df[list(var)] + print(X) + print(X.info()) + + X_signal = df[var][df.target == 1] + X_bg = df[var][df.target == 0] + X_bg_dy = df[var][df.target_bg == 1] + X_bg_zz = df[var][df.target_bg == 2] + X_bg_wz = df[var][df.target_bg == 3] + X_bg_tt = df[var][df.target_bg == 4] + X_bg_zhtobb = df[var][df.target_bg == 5] + + y = df["target"] + print(y) + + y_signal = df["target"][df.target == 1] + y_bg = df["target"][df.target == 0] + y_bg_dy = df["target"][df.target_bg == 1] + y_bg_zz = df["target"][df.target_bg == 2] + y_bg_wz = df["target"][df.target_bg == 3] + y_bg_tt = df["target"][df.target_bg == 4] + y_bg_zhtobb = df["target"][df.target_bg == 5] + + from sklearn.impute import SimpleImputer + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import OneHotEncoder + + categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + + from sklearn.preprocessing import StandardScaler + numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + + cat_cols = X.select_dtypes(exclude = "number").columns + num_cols = X.select_dtypes(include = "number").columns + + print(cat_cols) + print(num_cols) + + from sklearn.compose import ColumnTransformer + + full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + + + + X_processed = full_processor.fit_transform(X) + y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + + y_processed_sig = SimpleImputer(strategy = "most_frequent").fit_transform(y_signal.values.reshape(-1,1)) + y_processed_bg = SimpleImputer(strategy = "most_frequent").fit_transform(y_bg.values.reshape(-1,1)) + + y_processed_bg_dy = SimpleImputer(strategy = "most_frequent").fit_transform(y_bg_dy.values.reshape(-1,1)) + y_processed_bg_zz = SimpleImputer(strategy = "most_frequent").fit_transform(y_bg_zz.values.reshape(-1,1)) + y_processed_bg_wz = SimpleImputer(strategy = "most_frequent").fit_transform(y_bg_wz.values.reshape(-1,1)) + y_processed_bg_tt = SimpleImputer(strategy = "most_frequent").fit_transform(y_bg_tt.values.reshape(-1,1)) + y_processed_bg_zhtobb = SimpleImputer(strategy = "most_frequent").fit_transform(y_bg_zhtobb.values.reshape(-1,1)) + + from sklearn.model_selection import train_test_split + + X_train, X_test, y_train, y_test = train_test_split(X, y_processed, stratify = y_processed, random_state = 1121218) + X_train_sig, X_test_sig, y_train_sig, y_test_sig = 
train_test_split(X_signal, y_processed_sig, stratify = y_processed_sig, random_state = 1121218) + X_train_bg, X_test_bg, y_train_bg, y_test_bg = train_test_split(X_bg, y_processed_bg, stratify = y_processed_bg, random_state = 1121218) + X_train_bg_dy, X_test_bg_dy, y_train_bg_dy, y_test_bg_dy = train_test_split(X_bg_dy, y_processed_bg_dy, stratify = y_processed_bg_dy, random_state = 1121218) + X_train_bg_zz, X_test_bg_zz, y_train_bg_zz, y_test_bg_zz = train_test_split(X_bg_zz, y_processed_bg_zz, stratify = y_processed_bg_zz, random_state = 1121218) + X_train_bg_wz, X_test_bg_wz, y_train_bg_wz, y_test_bg_wz = train_test_split(X_bg_wz, y_processed_bg_wz, stratify = y_processed_bg_wz, random_state = 1121218) + X_train_bg_tt, X_test_bg_tt, y_train_bg_tt, y_test_bg_tt = train_test_split(X_bg_tt, y_processed_bg_tt, stratify = y_processed_bg_tt, random_state = 1121218) + X_train_bg_zhtobb, X_test_bg_zhtobb, y_train_bg_zhtobb, y_test_bg_zhtobb = train_test_split(X_bg_zhtobb, y_processed_bg_zhtobb, stratify = y_processed_bg_zhtobb, random_state = 1121218) + #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 101, stratify = y) + print(X_train) + print(X_test) + print(y_train) + + + + + + pretty_ROC_Curve(f"plot/{folder_save}/results_lr_{roi}_{eta}_bg_full_bg_set.json", kind, "full", versions_true) +############################################################################################################################################################## +##################### Zero train ROC ######################################################################################################################### +############################################################################################################################################################## + + pretty_ROC_Curve(f"plot/{folder_save}/results_zero_train_lr_{roi}_{eta}_bg_full_bg_set.json", kind, 'zero', versions_true) + +############################################################################################################################################################## +##################### Weak train ROC ######################################################################################################################### +############################################################################################################################################################## + + pretty_ROC_Curve(f"plot/{folder_save}/results_weak_train_lr_{roi}_{eta}_bg_full_bg_set.json", kind, 'weak', versions_true) + +############################################################################################################################################################## + + + trials = Trials() + +############################################################################################################################################################## +##################### Initiate the final training to be presented with the best parameters ################################################################### +############################################################################################################################################################## + + from sklearn.metrics import accuracy_score + +### Init classifier + xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = 0.0292, gamma = 1.087, reg_alpha = 42.0, reg_lambda = 0.381, n_estimators = 200, max_depth = 8, subsample = 0.841, min_child_weight = 2.0, colsample_bytree = 0.994, scale_pos_weight = 
10) + +### Fit + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + dtest = xgb.DMatrix(X_test, label = y_test) + dtest_signal = xgb.DMatrix(X_test_sig, label = y_test_sig) + dtest_bg = xgb.DMatrix(X_test_bg, label = y_test_bg) + dtest_bg_dy = xgb.DMatrix(X_test_bg_dy, label = y_test_bg_dy) + dtest_bg_zz = xgb.DMatrix(X_test_bg_zz, label = y_test_bg_zz) + dtest_bg_wz = xgb.DMatrix(X_test_bg_wz, label = y_test_bg_wz) + dtest_bg_tt = xgb.DMatrix(X_test_bg_tt, label = y_test_bg_tt) + dtest_bg_zhtobb = xgb.DMatrix(X_test_bg_zhtobb, label = y_test_bg_zhtobb) +#print(dtest) + dtrain = xgb.DMatrix(X_train[:int(len(X_train)*0.8)], label = y_train[:int(len(y_train)*0.8)]) + dval = xgb.DMatrix(X_train[int(len(X_train)*0.8):], label = y_train[int(len(y_train)*0.8):]) + model_xgb = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 200) #num_boost_round = 1000, + model_xgb_weak = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 20) #num_boost_round = 1000, + model_xgb_zero = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 2) #num_boost_round = 1000, + sets = [dtrain, dval, dtest] + results_new = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + results_new_weak = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + results_new_zero = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + params_new = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss'} + + for i, ds in enumerate(results_new.keys()): + print(i) + y_preds_new = model_xgb.predict(sets[i]) + y_preds_new_weak = model_xgb_weak.predict(sets[i]) + y_preds_new_zero = model_xgb_zero.predict(sets[i]) + labels_new = sets[i].get_label() + fpr_new, tpr_new, thresholds_new = roc_curve(labels_new, y_preds_new) + fpr_new_weak, tpr_new_weak, thresholds_new_weak = roc_curve(labels_new, y_preds_new_weak) + fpr_new_zero, tpr_new_zero, thresholds_new_zero = roc_curve(labels_new, y_preds_new_zero) + results_new[ds]['fpr'].append(fpr_new) + results_new[ds]['tpr'].append(tpr_new) + results_new[ds]['thresholds'].append(thresholds_new) + results_new[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new)) + results_new_weak[ds]['fpr'].append(fpr_new_weak) + results_new_weak[ds]['tpr'].append(tpr_new_weak) + results_new_weak[ds]['thresholds'].append(thresholds_new_weak) + results_new_weak[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new_weak)) + results_new_zero[ds]['fpr'].append(fpr_new_zero) + results_new_zero[ds]['tpr'].append(tpr_new_zero) + results_new_zero[ds]['thresholds'].append(thresholds_new_zero) + results_new_zero[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new_zero)) + + + + pretty_ROC_Curve_var(results_new, 'test', 'full', versions_true) + + pretty_ROC_Curve_var_test_train_val(results_new, 'full', versions_true) + + xgb_cl.fit(X_train, y_train) + + print(xgb_cl) + +################################################################################################################################### +################################## Predict and give the final accuracy 
scores and importance plots ################################
+###################################################################################################################################
+ preds = xgb_cl.predict(X_test)
+
+ print(accuracy_score(y_test, preds))
+
+ print(y_test)
+ print(model_xgb.predict(dtest))
+ print(np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)]))
+ # binarise the booster scores at 0.5 (same as (model_xgb.predict(dtest) > 0.5).astype(int))
+ predict_train = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)])
+ predict_train_weak = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb_weak.predict(dtest)])
+ predict_train_zero = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb_zero.predict(dtest)])
+
+ print(accuracy_score(y_test, predict_train))
+
+ from xgboost import plot_importance
+ from xgboost import plot_tree, to_graphviz
+
+ #importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_})
+ #importances = importances.sort_values(by = "Importance", ascending = False)
+ #importances = importances.set_index('Feature')
+ #print(importances)
+ #importances.plot.bar()
+ '''
+fig, ax = plt.subplots(figsize=(17,12))
+plot_importance(xgb_cl, fmap = 'feature_map_var.txt', ax = ax)
+plt.xlabel('Feature scores')
+plt.ylabel("Feature names")
+plt.title('Importance plot')
+plt.legend([''])
+#plt.show()
+plt.savefig(f"plot/{folder_save}/importance_{var}.jpg")
+
+
+feature_importance = model_xgb.get_score(importance_type = 'weight')
+keys = list(feature_importance.keys())
+names_sig = ['m(H)', '$p_t$(H)', '$p_t$(Z)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$',
+ '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet',
+ '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$',
+ '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$']
+names_sig = ['$\Delta\Phi (l_{subleading}, jet_{subleading})$']
+values = list(feature_importance.values())
+data = pd.DataFrame(data = values, index = names_sig, columns = ['score']).sort_values(by = 'score', ascending = False)
+print(data)
+print(data.index)
+
+
+fig = plt.figure(figsize=(17,12))
+ax1 = fig.add_subplot(1,2,1)
+ax1.set_axis_off()
+ax2 = fig.add_subplot(1,2,2)
+ax2.barh(list(reversed(data.index)), list(reversed(data.score)))
+ax2.set_xlabel('Feature scores')
+ax2.set_ylabel("Feature names")
+ax2.set_title('Importance plot')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/importance_train_lr_{learning_rate}_{var}.jpg")
+
+
+plt.figure(figsize=(17,12))
+plot_tree(xgb_cl, fmap = 'feature_map_var.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/boost_tree_{var}.jpg", dpi = 1800)
+###result = 1/(1+np.exp(-leaf_value)) for belonging to class 1
+#plt.show()
+
+plt.figure(figsize=(17,12))
+plot_tree(model_xgb, fmap = 'feature_map_var.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/boost_tree_train_lr_{learning_rate}_{var}.jpg", dpi = 1800)
+###result = 1/(1+np.exp(-leaf_value)) for belonging to class 1
+#plt.show()
+ '''
+ # 26 uniform edges on [0, 1], i.e. 25 bins (equivalent to np.linspace(0, 1, 26))
+ bins = np.array([0 + i*((1-0)/25) for i in range(0, 26)])
+ plt.figure(figsize=(17,12))
+ plt.hist(np.array(model_xgb.predict(dtest)), bins = bins, edgecolor = 'blue',fill = False)
+ plt.hist(np.array(predict_train), bins = bins, edgecolor = 'green', hatch = '/', fill = False)
+
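 # y_test is binary, so this reference histogram collapses into the two
+ # outermost bins; it only marks the sizes of the two label populations
+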
 plt.hist(y_test, bins = bins, facecolor = 'orange', edgecolor = 'orange', fill = False)
+ plt.title('Classifier output')
+ plt.legend(['Train output', 'Train output after threshold','Test data'])
+ #plt.show()
+ plt.savefig(f"plot/{folder_save}/{roi}/class_output_train_lr_{learning_rate}_{versions_true}.jpg")
+
+ plt.figure(figsize=(17,12))
+ plt.hist(np.array(model_xgb.predict(dtest_signal)), bins = bins, edgecolor = 'blue',fill = False)
+ plt.hist(np.array(model_xgb.predict(dtest_bg)), bins = bins, edgecolor = 'red', fill = False)
+ plt.title('Classifier output')
+ plt.legend(['Signal', 'Background'])
+ #plt.show()
+ plt.savefig(f"plot/{folder_save}/{roi}/class_output_train_lr_{learning_rate}_{versions_true}_sig_vs_bg.jpg")
+
+ # evaluate the classifier once per background process; the list is reused by
+ # both the raw and the cross-section-weighted stacks below
+ bg_preds = [np.array(model_xgb.predict(d)) for d in (dtest_bg_dy, dtest_bg_zz, dtest_bg_wz, dtest_bg_tt, dtest_bg_zhtobb)]
+
+ plt.figure(figsize=(17,12))
+
+ plt.hist(bg_preds, bins = bins, color=['g', 'y', 'b', 'm', 'c'], stacked = True, alpha = 0.5)
+ plt.hist(np.array(model_xgb.predict(dtest_signal)), bins = bins, edgecolor = 'red', fill = False)
+ plt.title('Classifier output')
+ plt.legend(['DY', 'ZZ', 'WZ', "tt", "ZHtobb", 'Signal'])
+ #plt.show()
+ plt.savefig(f"plot/{folder_save}/{roi}/class_output_lr_{learning_rate}_{versions_true}_sig_vs_bg_alls.jpg")
+
+ lumi = 41480 # integrated luminosity in pb^-1 (41.48 fb^-1, i.e. the 2017 dataset)
+ # per-process event weight: cross section [pb] * lumi / number of generated events
+ xsec_weights = [6077.*(lumi/102863931), 3.74*(lumi/19134840), 6.419*(lumi/18136498), 88.51*(lumi/105859990), 0.00720*(lumi/4337504)]
+
+ plt.figure(figsize=(17,12))
+
+ plt.hist(bg_preds, bins = bins, weights = [w*np.ones(len(p)) for w, p in zip(xsec_weights, bg_preds)], color=['g', 'y', 'b', 'm', 'c'], stacked = True, alpha = 0.5)
+ plt.hist(np.array(model_xgb.predict(dtest_signal)), bins = bins, edgecolor = 'red', fill = False)
+ plt.title('Classifier output')
+ plt.legend(['DY', 'ZZ', 'WZ', "tt", "ZHtobb", 'Signal'])
+ #plt.show()
+ plt.savefig(f"plot/{folder_save}/{roi}/class_output_scaled_lr_{learning_rate}_{versions_true}_sig_vs_bg_alls.jpg")
+
+
+ plt.figure(figsize=(17,12))
+ plt.hist(np.array(model_xgb_weak.predict(dtest)), bins = bins, edgecolor = 'blue',fill = False)
+ plt.hist(np.array(predict_train_weak), bins = bins, edgecolor = 'green', hatch = '/', fill = False)
+ plt.hist(y_test, bins = bins, facecolor = 'orange', edgecolor = 'orange', fill = False)
+ plt.title('Classifier output')
+ plt.legend(['Train output', 'Train output after threshold','Test data'])
+ #plt.show()
+ plt.savefig(f"plot/{folder_save}/{roi}/class_output_train_lr_{learning_rate}_{versions_true}_weak.jpg")
+
+ plt.figure(figsize=(17,12))
+
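 # same overlay for the 2-round 'zero' model, kept as a heavily
+ # under-trained reference
+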
 plt.hist(np.array(model_xgb_zero.predict(dtest)), bins = bins, edgecolor = 'blue',fill = False)
+ plt.hist(np.array(predict_train_zero), bins = bins, edgecolor = 'green', hatch = '/', fill = False)
+ plt.hist(y_test, bins = bins, facecolor = 'orange', edgecolor = 'orange', fill = False)
+ plt.title('Classifier output')
+ plt.legend(['Train output', 'Train output after threshold','Test data'])
+ #plt.show()
+ plt.savefig(f"plot/{folder_save}/{roi}/class_output_train_lr_{learning_rate}_{versions_true}_zero.jpg")
+
+###result = 1/(1+np.exp(-leaf_value)) for belonging to class 1
+#plt.show()'''
+
+with open(f"plot/{folder_save}/ROC.txt", "a") as myfile:
+ myfile.write(f"Accuracy for {var}: " + str(accuracy_score(y_test, predict_train)) + " " + '\n')
+
+'''
+plt.figure(figsize=(17,12))
+to_graphviz(model_xgb, fmap = 'feature_map.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/boost_tree_train_graphviz.jpg", dpi = 1800)
+###result = 1/(1+np.exp(-leaf_value)) for belonging to class 1
+#plt.show()'''
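
+# Minimal serialisation sketch (an assumption, not part of the original flow):
+# the 'convert' helper defined above turns numpy arrays into lists, which is
+# what the json module needs in order to store the ROC containers; the target
+# file name mirrors the one read back by pretty_ROC_Curve and is illustrative.
+#with open(f"plot/{folder_save}/results_lr_{roi}_{eta}_bg_full_bg_set.json", "w") as f_out:
+#    json.dump(results_new, f_out, default = convert)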