From c7d1c0c6ead2e9a5917031158ac7c212b0a4cc97 Mon Sep 17 00:00:00 2001 From: ValVau <109755950+ValVau@users.noreply.github.com> Date: Mon, 8 May 2023 14:59:22 +0200 Subject: [PATCH 1/3] Add files via upload These files are a part of the current VHcc analysis with arrays saving method and BDT training. --- Zll_process_newHist_pandas.py | 1542 +++++++++++++++ ...s_newHist_pandas_small_update_isolation.py | 1687 +++++++++++++++++ cfg_VHcc_mod.py | 163 ++ xgb_test.py | 456 +++++ xgb_test_no_coffea.py | 605 ++++++ xgb_test_only_xgb.py | 361 ++++ xgb_test_only_xgb_no_coffea.py | 399 ++++ xgb_test_only_xgb_reloaded.py | 294 +++ xgb_test_only_xgb_reloaded_no_coffea.py | 287 +++ xgb_test_only_xgb_reloaded_no_coffea_var.py | 404 ++++ 10 files changed, 6198 insertions(+) create mode 100644 Zll_process_newHist_pandas.py create mode 100644 Zll_process_newHist_pandas_small_update_isolation.py create mode 100644 cfg_VHcc_mod.py create mode 100644 xgb_test.py create mode 100644 xgb_test_no_coffea.py create mode 100644 xgb_test_only_xgb.py create mode 100644 xgb_test_only_xgb_no_coffea.py create mode 100644 xgb_test_only_xgb_reloaded.py create mode 100644 xgb_test_only_xgb_reloaded_no_coffea.py create mode 100644 xgb_test_only_xgb_reloaded_no_coffea_var.py diff --git a/Zll_process_newHist_pandas.py b/Zll_process_newHist_pandas.py new file mode 100644 index 0000000..a35a4d4 --- /dev/null +++ b/Zll_process_newHist_pandas.py @@ -0,0 +1,1542 @@ +import csv +from curses import meta +from dataclasses import dataclass +import gzip +import pickle, os, sys, mplhep as hep, numpy as np +from select import select + +import json + +#from coffea import hist, processor # ToDo: move to the better hist +from coffea import processor # ToDo: move to the better hist +import hist +from hist import Hist +from coffea.nanoevents.methods import vector +import awkward as ak +from VHcc.utils.correction import jec,muSFs,eleSFs,init_corr +from coffea.lumi_tools import LumiMask +from coffea.analysis_tools import Weights, PackedSelection +from functools import partial +# import numba +from VHcc.helpers.util import reduce_and, reduce_or, nano_mask_or, get_ht, normalize, make_p4 + +def empty_column_accumulator(): + #return processor.column_accumulator(np.array([],dtype=object)) + return processor.column_accumulator(np.array([],dtype=np.float64)) +def array_accumulator(): + return processor.defaultdict_accumulator(empty_column_accumulator) + +def mT(obj1,obj2): + return np.sqrt(2.*obj1.pt*obj2.pt*(1.-np.cos(obj1.phi-obj2.phi))) +def flatten(ar): # flatten awkward into a 1d array to hist + return ak.flatten(ar, axis=None) +def normalize(val, cut): + if cut is None: + ar = ak.to_numpy(ak.fill_none(val, np.nan)) + return ar + else: + ar = ak.to_numpy(ak.fill_none(val[cut], np.nan)) + return ar + +def read_json(path): + f = open(path) + data = json.load(f) + return data + +def dataset_name_to_number(dataset, year): + samples_path = 'src/VHcc/metadata/sample_info_' + year + '_reversed' + + samples = read_json(samples_path+'.json') + + return samples[dataset]['type'], samples[dataset]['doJetFlavorSplit'] + +def dataset_categories(year): + map_path = 'src/VHcc/metadata/mergemap_' + year + '_Zll' + + samples = read_json(map_path+'.json').values() + all_datasets = [item for sublist in samples for item in sublist] + + return all_datasets + +def get_info_dict(year): + with open(f'src/VHcc/metadata/sample_info_{year}.json') as si: + info = json.load(si) + info_dict={} + for obj in info: + #print(obj) + info_dict[obj]=info[obj]['name'] + return 
info_dict + +class NanoProcessor(processor.ProcessorABC): + def __init__(self, cfg): + self.cfg = cfg + self._year = self.cfg.dataset["year"] + self._campaign = self.cfg.dataset["campaign"] + + self._version=self.cfg.userconfig['version'] # only because the new runner etc. needs that, not used later + self._export_array = True # if 'test' in self._version else False + self._debug = False #True + + # paths from table 1 and 2 of the AN_2020_235 + + # l l + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L3230-L3337 + self._mumu_hlt = { + '2016': [ + 'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL', + 'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ', + 'Mu17_TrkIsoVVL_TkMu8_TrkIsoVVL', + 'Mu17_TrkIsoVVL_TkMu8_TrkIsoVVL_DZ' + ], + '2017': [ + 'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL', + 'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ', + #'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ_Mass3p8',#allowMissingBranch=1 + #'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ_Mass8'#allowMissingBranch=1 + ], + '2018': [ + #'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL', + #'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ', + 'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ_Mass3p8',#allowMissingBranch=1 but this is the only used one in 2018?! + #'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ_Mass8'#allowMissingBranch=1 + ], + } + + self._ee_hlt = { + '2016': [ + 'Ele23_Ele12_CaloIdL_TrackIdL_IsoVL_DZ' + ], + '2017': [ + 'Ele23_Ele12_CaloIdL_TrackIdL_IsoVL', + #'Ele23_Ele12_CaloIdL_TrackIdL_IsoVL_DZ' # not in VHccAnalysis code + ], + '2018': [ + 'Ele23_Ele12_CaloIdL_TrackIdL_IsoVL' + ], + } + + ''' + # l nu + self._munu_hlt = { + '2016': [ + 'IsoMu24', + 'IsoTkMu24' + ], + '2017': [ + 'IsoMu24', + 'IsoMu27' + ], + '2018': [ + 'IsoMu24', + 'IsoMu27' + ], + } + + self._enu_hlt = { + '2016': [ + 'Ele27_eta2p1_WPTight_Gsf' + ], + '2017': [ + 'Ele32_WPTight_Gsf_L1DoubleEG', + 'Ele32_WPTight_Gsf' + ], + '2018': [ + 'Ele32_WPTight_Gsf_L1DoubleEG', + 'Ele32_WPTight_Gsf'#allowMissingBranch=1 + ], + } + + # nu nu + self._nunu_hlt = { + '2016': [ + 'PFMET110_PFMHT110_IDTight', + #'PFMET110_PFMHT120_IDTight', # found in hltbranches_2016.txt but not in AN, maybe redundant? + 'PFMET170_NoiseCleaned',#allowMissingBranch=1 + 'PFMET170_BeamHaloCleaned',#allowMissingBranch=1 + 'PFMET170_HBHECleaned' + ], + '2017': [ + 'PFMET110_PFMHT110_IDTight', + 'PFMET120_PFMHT120_IDTight', + 'PFMET120_PFMHT120_IDTight_PFHT60',#allowMissingBranch=1 + 'PFMETTypeOne120_PFMHT120_IDTight' + ], + '2018': [ + 'PFMET110_PFMHT110_IDTight', + 'PFMET120_PFMHT120_IDTight', + 'PFMET120_PFMHT120_IDTight_PFHT60'#allowMissingBranch=1 + ], + } + + ''' + + # differences between UL and EOY + # see https://twiki.cern.ch/twiki/bin/view/CMS/MissingETOptionalFiltersRun2 + # also look at sec. 
3.7.2 + self._met_filters = { + '2016': { + 'data': [ + 'goodVertices', + 'globalSuperTightHalo2016Filter', + 'HBHENoiseFilter', + 'HBHENoiseIsoFilter', + 'EcalDeadCellTriggerPrimitiveFilter', + 'BadPFMuonFilter', + #'BadPFMuonDzFilter', # not in EOY + 'eeBadScFilter', + ], + 'mc': [ + 'goodVertices', + 'globalSuperTightHalo2016Filter', + 'HBHENoiseFilter', + 'HBHENoiseIsoFilter', + 'EcalDeadCellTriggerPrimitiveFilter', + 'BadPFMuonFilter', + #'BadPFMuonDzFilter', # not in EOY + #'eeBadScFilter', # not suggested in EOY MC + ], + }, + '2017': { + "data": [ + "goodVertices", + "globalSuperTightHalo2016Filter", + "HBHENoiseFilter", + "HBHENoiseIsoFilter", + "EcalDeadCellTriggerPrimitiveFilter", + "BadPFMuonFilter", + "BadPFMuonDzFilter", + "hfNoisyHitsFilter", + "eeBadScFilter", + "ecalBadCalibFilter", + ], + "mc": [ + "goodVertices", + "globalSuperTightHalo2016Filter", + "HBHENoiseFilter", + "HBHENoiseIsoFilter", + "EcalDeadCellTriggerPrimitiveFilter", + "BadPFMuonFilter", + "BadPFMuonDzFilter", + "hfNoisyHitsFilter", + "eeBadScFilter", + "ecalBadCalibFilter", + ], + }, + '2018': { + 'data': [ + 'goodVertices', + 'globalSuperTightHalo2016Filter', + 'HBHENoiseFilter', + 'HBHENoiseIsoFilter', + 'EcalDeadCellTriggerPrimitiveFilter', + 'BadPFMuonFilter', + #'BadPFMuonDzFilter', # not in EOY + #'hfNoisyHitsFilter', # not in EOY + 'eeBadScFilter', + 'ecalBadCalibFilterV2', + ], + 'mc': [ + 'goodVertices', + 'globalSuperTightHalo2016Filter', + 'HBHENoiseFilter', + 'HBHENoiseIsoFilter', + 'EcalDeadCellTriggerPrimitiveFilter', + 'BadPFMuonFilter', + #'BadPFMuonDzFilter', # not in EOY + #'hfNoisyHitsFilter', # not in EOY + #'eeBadScFilter', # not suggested in EOY MC + 'ecalBadCalibFilterV2', + ], + }, + } + + # https://gitlab.cern.ch/aachen-3a/vhcc-nano/-/blob/master/crab/crab_all.py#L33-36 + #'https://cms-service-dqmdc.web.cern.ch/CAF/certification/Collisions16/13TeV/ReReco/Final/Cert_271036-284044_13TeV_23Sep2016ReReco_Collisions16_JSON.txt' + #'https://cms-service-dqmdc.web.cern.ch/CAF/certification/Collisions17/13TeV/ReReco/Cert_294927-306462_13TeV_EOY2017ReReco_Collisions17_JSON.txt' + #'https://cms-service-dqmdc.web.cern.ch/CAF/certification/Collisions18/13TeV/ReReco/Cert_314472-325175_13TeV_17SeptEarlyReReco2018ABC_PromptEraD_Collisions18_JSON.txt' + # downloaded. 
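+ # For reference, a coffea LumiMask is just a callable built from the golden-JSON
+ # run/lumi ranges: given the per-event run and luminosityBlock arrays it returns a
+ # boolean mask of certified events, e.g. (illustrative only, mirroring process() below):
+ # good = self._lumiMasks['2017'](events.run, events.luminosityBlock)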
+ self._lumiMasks = {
+ '2016': LumiMask('src/VHcc/data/Lumimask/Cert_271036-284044_13TeV_23Sep2016ReReco_Collisions16_JSON.txt'),
+ '2017': LumiMask('src/VHcc/data/Lumimask/Cert_294927-306462_13TeV_EOY2017ReReco_Collisions17_JSON.txt'),
+ '2018': LumiMask('src/VHcc/data/Lumimask/Cert_314472-325175_13TeV_17SeptEarlyReReco2018ABC_PromptEraD_Collisions18_JSON.txt')
+ }
+
+ self._corr = init_corr(self._year)
+
+ # Axes: Cat - what it is, a type of something, described with words
+ # Bin - how much of something, numerical things
+ #
+ # --> Some axes are already connected to specific objects, or to the event
+ # --> Others are "building-blocks" that can be reused multiple times
+
+ list_of_datasets = dataset_categories(self._year)
+ #print(list_of_datasets)
+ #sys.exit()
+ # Define axes
+ # Should read axes from NanoAOD config / metadata
+ #dataset_axis = hist.Cat("dataset", "Primary dataset")
+ dataset_axis = hist.axis.StrCategory([], name="dataset", label="Primary dataset", growth=True)
+ # split V+jets sample & VZ signal, this is per event
+ # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L2184-L2276
+ #datasetSplit_axis = hist.Cat("datasetSplit", "Dataset split by flav", list_of_datasets)
+ datasetSplit_axis = hist.axis.StrCategory(list_of_datasets, name="datasetSplit", label="Dataset split by flav")
+
+ # use hadronFlavour, necessary when applying btag scale factors (that depend on flavour)
+ # this one will be done per jet, can have values 0, 4, 5
+ #flav_axis = hist.Bin("flav", r"hadronFlavour",[0,1,4,5,6])
+ flav_axis = hist.axis.Variable([0,1,4,5,6], name="flav", label="hadronFlavour")
+
+ #lepflav_axis = hist.Cat("lepflav",['ee','mumu'])
+ lepflav_axis = hist.axis.StrCategory(['ee','mumu'], name="lepflav", label="Lepton flav")
+
+ regions = ['SR_2LL','SR_2LH',
+ 'CR_Zcc_2LL','CR_Zcc_2LH',
+ 'CR_Z_LF_2LL','CR_Z_LF_2LH',
+ 'CR_Z_HF_2LL','CR_Z_HF_2LH',
+ 'CR_t_tbar_2LL','CR_t_tbar_2LH']
+ #region_axis = hist.Cat("region",regions)
+ region_axis = hist.axis.StrCategory(regions, name="region", label="Region")
+
+ # Events
+ njet_axis = hist.axis.Regular(13, -.5, 12.5, name="nj", label="N jets") #hist.Bin("nj", r"N jets", 13, -.5, 12.5)
+
+ nAddJets_axis = hist.axis.Regular(11, -.5, 10.5, name="nAddJets302p5_puid", label="N additional jets")
+ #hist.Bin("nAddJets302p5_puid", r"N additional jets", 11, -.5, 10.5)
+ nAddJets_FSRsub_axis = hist.axis.Regular(11, -.5, 10.5, name="nAddJetsFSRsub302p5_puid", label="N additional jets (FSR subtracted)")
+ #hist.Bin("nAddJetsFSRsub302p5_puid", r"N additional jets (FSR subtracted)", 11, -.5, 10.5)
+
+ #nbjet_axis = hist.Bin("nbj", r"N b jets", [0,1,2,3,4,5])
+ #ncjet_axis = hist.Bin("ncj", r"N c jets", [0,1,2,3,4,5])
+ # kinematic variables
+ pt_axis = hist.axis.Regular(50, 0, 300, name="pt", label=r"$p_{T}$ [GeV]")
+ #hist.Bin("pt", r" $p_{T}$ [GeV]", 50, 0, 300)
+ eta_axis = hist.axis.Regular(25, -2.5, 2.5, name="eta", label=r"$\eta$")
+ #hist.Bin("eta", r" $\eta$", 25, -2.5, 2.5)
+ phi_axis = hist.axis.Regular(30, -3, 3, name="phi", label=r"$\phi$")
+ #hist.Bin("phi", r" $\phi$", 30, -3, 3)
+ mass_axis = hist.axis.Regular(50, 0, 300, name="mass", label=r"$m$ [GeV]")
+ #hist.Bin("mass", r" $m$ [GeV]", 50, 0, 300)
+ mt_axis = hist.axis.Regular(30, 0, 300, name="mt", label=r"$m_{T}$ [GeV]")
+ #hist.Bin("mt", r" $m_{T}$ [GeV]", 30, 0, 300)
+ dr_axis = hist.axis.Regular(20, 0, 5, name="dr", label=r"$\Delta$R")
+ #hist.Bin("dr","$\Delta$R",20,0,5)
+
+ # some more variables to check, which enter BDT
+ # need to revisit this later, because high Vpt and low Vpt can have different binning
+ jjVPtRatio_axis = hist.axis.Regular(15, 0, 2, name="jjVPtRatio", label=r"$p_{T}(jj) / p_{T}(V)$")
+ #hist.Bin("jjVPtRatio",r"$p_{T}(jj) / p_{T}(V)$",15,0,2)
+
+
+ #dphi_V_H_axis = hist.Bin("dphi_V_H","$\Delta\Phi(V, H)$",20,0,3.2)
+ # jet jet
+ #dr_j1_j2_axis = hist.Bin("dr_j1_j2","$\Delta R(j1,j2)$",20,0,5)
+ #dphi_j1_j2_axis = hist.Bin("dphi_j1_j2","$\Delta\Phi(j1,j2)$",15,-3.2,3.2)
+ #deta_j1_j2_axis = hist.Bin("deta_j1_j2","$\Delta\eta(j1,j2)$",15,0,3)
+ # lepton lepton
+ #dphi_l1_l2_axis = hist.Bin("dphi_l1_l2","$\Delta\Phi(l1,l2)$",15,0,3.2)
+ #deta_l1_l2_axis = hist.Bin("eta_l1_l2","$\Delta\eta(l1,l2)$",15,0,2.6)
+ # jet lepton
+ #dphi_j1_l1_axis = hist.Bin("dphi_j1_l1","$\Delta\Phi(j1,l1)$",15,0,3.2)
+ #dphi_j2_l1_axis = hist.Bin("dphi_j2_l1","$\Delta\Phi(j2,l1)$",15,0,3.2)
+ #dphi_j1_l2_axis = hist.Bin("dphi_j1_l2","$\Delta\Phi(j1,l2)$",15,0,3.2)
+ #dphi_j2_l2_axis = hist.Bin("dphi_j2_l2","$\Delta\Phi(j2,l2)$",15,0,3.2)
+
+ # ToDo: several other variables can only be stored after kinfit
+ # e.g. here https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/python/kinfitter.py
+ # or https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L4670-L4995
+
+ # weights are interesting as well
+ weight_axis = hist.axis.Regular(100, -5.001, 5.001, name="weight_full", label="weight_full")
+ #hist.Bin("weight_full","weight_full",100,-5.001, 5.001)
+ genweight_axis = hist.axis.Regular(100, -0.001, 0.001, name="genWeight", label="genWeight")
+ #hist.Bin("genWeight","genWeight",100,-0.001, 0.001)
+ sign_genweight_axis = hist.axis.Regular(100, -1.001, 1.001, name="genWeight_by_abs", label="genWeight/abs(genWeight)")
+ #hist.Bin("genWeight_by_abs","genWeight/abs(genWeight)",100,-1.001,1.001)
+
+
+ # MET vars
+ #signi_axis = hist.Bin("significance", r"MET $\sigma$",20,0,10)
+ #covXX_axis = hist.Bin("covXX",r"MET covXX",20,0,10)
+ #covXY_axis = hist.Bin("covXY",r"MET covXY",20,0,10)
+ #covYY_axis = hist.Bin("covYY",r"MET covYY",20,0,10)
+ #sumEt_axis = hist.Bin("sumEt", r" MET sumEt", 50, 0, 300)
+
+ # ToDo: switch to this
+ # axis.StrCategory([], name='region', growth=True),
+ #disc_list = [ 'btagDeepCvL', 'btagDeepCvB','btagDeepFlavCvB','btagDeepFlavCvL']#,'particleNetAK4_CvL','particleNetAK4_CvB']
+ # As far as I can tell, we only need DeepFlav currently
+ ### In all of the older stuff, use:
+ #disc_list = ['btagDeepFlavC','btagDeepFlavB','btagDeepFlavCvL','btagDeepFlavCvB']
+ ### With the new UL stuff, use:
+ disc_list = ['btagDeepFlavCvL','btagDeepFlavCvB']
+ btag_axes = []
+ for d in disc_list:
+ # technically, -1 values are possible, but probably unlikely to matter much after event selection
+ btag_axes.append(hist.axis.Regular(20, 0, 1, name=d, label=d)
+ #hist.Bin(d, d , 20, 0, 1)
+ )
+ #h = (
+ # Hist.new.Reg(10, -5, 5, overflow=False, underflow=False, name="A")
+ # .Bool(name="B")
+ # .Var(range(10), name="C")
+ # .Int(-5, 5, overflow=False, underflow=False, name="D")
+ # .IntCat(range(10), name="E")
+ # .StrCat(["T", "F"], name="F")
+ # .Double()
+ #)
+ #print(type(dataset_axis))
+ #print(type(lepflav_axis))
+ #print(type(flav_axis))
+ #print(type(njet_axis))
+ #print(type(hist.storage.Weight()))
+ #testHistA = Hist(dataset_axis, datasetSplit_axis, lepflav_axis, region_axis, njet_axis, hist.storage.Weight())
+ #testHist = Hist(
+ # #dataset_axis,
+ # #datasetSplit_axis,
+ # #lepflav_axis,
+ # #region_axis,
+ # njet_axis,
+ # hist.storage.Weight()
+ #
) + _hist_event_dict = { + 'nj' : Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + njet_axis, hist.storage.Weight()), + 'nAddJets302p5_puid' : Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + nAddJets_axis, hist.storage.Weight()), + 'nAddJetsFSRsub302p5_puid' : Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + nAddJets_FSRsub_axis, hist.storage.Weight()), + # 'weight_full' : Hist(datasetSplit_axis, + # lepflav_axis, + # region_axis, + # weight_axis, hist.storage.Weight()), + # 'genweight' : Hist(datasetSplit_axis, + # lepflav_axis, + # region_axis, + # genweight_axis, hist.storage.Weight()), + # 'sign_genweight' : Hist(datasetSplit_axis, + # lepflav_axis, + # region_axis, + # sign_genweight_axis, hist.storage.Weight()), + 'jjVPtRatio' : Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + jjVPtRatio_axis, hist.storage.Weight()) + + #'dphi_V_H' : Hist( dataset_axis, datasetSplit_axis, lepflav_axis, region_axis, ,dphi_V_H_axis) + #'dr_j1_j2' : Hist( dataset_axis, datasetSplit_axis, lepflav_axis, region_axis, ,dr_j1_j2_axis) + #'dphi_j1_j2' : Hist( dataset_axis, datasetSplit_axis, lepflav_axis, region_axis, dphi_j1_j2_axis) + #'deta_j1_j2' : Hist( dataset_axis, datasetSplit_axis, lepflav_axis, region_axis, deta_j1_j2_axis) + #'dphi_l1_l2' : Hist( dataset_axis, datasetSplit_axis, lepflav_axis, region_axis, dphi_l1_l2_axis) + #'dphi_j1_l2' : Hist( dataset_axis, datasetSplit_axis, lepflav_axis, region_axis, dphi_j1_l2_axis) + #'dphi_j2_l2' : Hist( dataset_axis, datasetSplit_axis, lepflav_axis, region_axis, dphi_j2_l2_axis) + + #'sampleFlavSplit' : Hist( dataset_axis, lepflav_axis, region_axis, sampleFlavSplit_axis), + #'nbj' : Hist( dataset_axis, lepflav_axis, region_axis, nbjet_axis), + #'ncj' : Hist( dataset_axis, lepflav_axis, region_axis, ncjet_axis), + #'hj_dr' : Hist( dataset_axis, lepflav_axis, region_axis, dr_axis), + #'MET_sumEt' : Hist( dataset_axis, lepflav_axis, region_axis, sumEt_axis), + #'MET_significance' : Hist( dataset_axis, lepflav_axis, region_axis, signi_axis), + #'MET_covXX' : Hist( dataset_axis, lepflav_axis, region_axis, covXX_axis), + #'MET_covXY' : Hist( dataset_axis, lepflav_axis, region_axis, covXY_axis), + #'MET_covYY' : Hist( dataset_axis, lepflav_axis, region_axis, covYY_axis), + #'MET_phi' : Hist( dataset_axis, lepflav_axis, region_axis, phi_axis), + #'MET_pt' : Hist( dataset_axis, lepflav_axis, region_axis, pt_axis), + #'mT1' : Hist( dataset_axis, lepflav_axis, region_axis, mt_axis), + #'mT2' : Hist( dataset_axis, lepflav_axis, region_axis, mt_axis), + #'mTh':Hist( dataset_axis, lepflav_axis, region_axis, mt_axis), + #'dphi_lep1':Hist( dataset_axis, lepflav_axis, region_axis, phi_axis), + #'dphi_lep2':Hist( dataset_axis, lepflav_axis, region_axis, phi_axis), + #'dphi_ll':Hist( dataset_axis, lepflav_axis, region_axis, phi_axis), + } + + # jets will be ordered by DeepJet (which is DeepFlav for historical reasons) + objects=['leading_jetflav','subleading_jetflav','lep1','lep2','ll','jj'] + + for i in objects: + # distinguish between jets and other objects, as the structure for jets contains additional flavour axis + if 'jet' in i: + _hist_event_dict["%s_pt" %(i)]=Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + flav_axis, + pt_axis, hist.storage.Weight()) + _hist_event_dict["%s_eta" %(i)]=Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + flav_axis, + eta_axis, hist.storage.Weight()) + _hist_event_dict["%s_phi" %(i)]=Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + flav_axis, + phi_axis, hist.storage.Weight()) + 
_hist_event_dict["%s_mass" %(i)]=Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + flav_axis, + mass_axis, hist.storage.Weight()) + else: + _hist_event_dict["%s_pt" %(i)]=Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + pt_axis, hist.storage.Weight()) + _hist_event_dict["%s_eta" %(i)]=Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + eta_axis, hist.storage.Weight()) + _hist_event_dict["%s_phi" %(i)]=Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + phi_axis, hist.storage.Weight()) + _hist_event_dict["%s_mass" %(i)]=Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + mass_axis, hist.storage.Weight()) + + # more information on the discriminators is stored for the first two jets, + # ordered by DeepJet CvL discriminator and called "leading" and "subleading" + for disc, axis in zip(disc_list,btag_axes): + _hist_event_dict["leading_jetflav_%s" %(disc)] = Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + flav_axis, + axis, hist.storage.Weight()) + _hist_event_dict["subleading_jetflav_%s" %(disc)] = Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + flav_axis, + axis, hist.storage.Weight()) + + self.event_hists = list(_hist_event_dict.keys()) + + # this can be used to not only store histograms, but also features on a per-event basis (arrays) + if self._export_array: + _hist_event_dict['array'] = processor.defaultdict_accumulator(array_accumulator) + #self._accumulator = processor.dict_accumulator( + # {**_hist_event_dict, + # #'cutflow': processor.defaultdict_accumulator( + # # partial(processor.defaultdict_accumulator, int)) + # }) + #self._accumulator['sumw'] = processor.defaultdict_accumulator(float) + + #self._accumulator = processor.dict_accumulator( + # { + # observable: Hist.Hist(var_axis, name="Counts", storage="Weight") + # for observable, var_axis in axis.items() + # if observable != "dataset" + # } + #) + #self._accumulator["cutflow"] = processor.defaultdict_accumulator( + # partial(processor.defaultdict_accumulator, int) + #) + #self._accumulator["sumw"] = 0 + + self.make_output = lambda: { + "cutflow": processor.defaultdict_accumulator( + partial(processor.defaultdict_accumulator, int) + ), + "sumw": 0, + **_hist_event_dict + } + + + @property + def accumulator(self): + return self._accumulator + + def process(self, events): + #output = self.accumulator #.identity() + output = self.make_output() + dataset = events.metadata['dataset'] + start = events.metadata['entrystart'] + stop = events.metadata['entrystop'] + output_location_list = [] + filename = events.metadata['filename'].split('/')[-1].strip('.root') + #print(dataset) + # Q: could there be MC that does not have this attribute? Or is it always the case? 
+ isRealData = not hasattr(events, "genWeight")
+
+ # Done (externally): map from the lengthy dataset (path) to a more readable name
+ # Keep the long name only for data, because it contains the Run info (necessary to apply corrections)
+ if isRealData:
+ info_dict = get_info_dict(self._year)
+ dataset_long = dataset
+ dictname = dataset[1:].split('/')[0]
+ dataset = info_dict[dictname]
+ print(dataset)
+ sample_type, doFlavSplit = dataset_name_to_number(dataset, self._year)
+ # length of events is used so many times later on, probably useful to just save it here and then refer to that
+ nEvents = len(events)
+ print('Number of events: ', nEvents)
+
+ # As far as I understand, this looks like a neat way to give selections a name,
+ # while internally, there are boolean arrays for all events
+ selection = PackedSelection()
+
+
+ # this is either counting events in data with weight 1, or weighted (MC)
+ if isRealData:
+ output['sumw'] += nEvents
+ else:
+ # instead of taking the weights themselves, only their sign is used:
+ # https://cms-talk.web.cern.ch/t/huge-event-weights-in-dy-powhegminnlo/8718/7
+ # I initially shared the concerns raised in that thread: if the absolute values
+ # differ between events (not just the sign), it is not obvious why keeping only
+ # the sign should average out. It must be tied to "LO without interference",
+ # where the absolute values are indeed identical; where they are not, the
+ # differences are considered to be negligible.
+ output['sumw'] += ak.sum(events.genWeight/abs(events.genWeight))
+
+
+ req_lumi=np.ones(nEvents, dtype='bool')
+ if isRealData:
+ req_lumi=self._lumiMasks[self._year](events.run, events.luminosityBlock)
+ selection.add('lumi',ak.to_numpy(req_lumi))
+ del req_lumi
+
+
+ # AS: sort of the same thing as above, but now per entry
+ weights = Weights(nEvents, storeIndividual=True)
+ if isRealData:
+ weights.add('genweight',np.ones(nEvents))
+ else:
+ weights.add('genweight',events.genWeight/abs(events.genWeight))
+ # weights.add('puweight', compiled['2017_pileupweight'](events.Pileup.nPU))
+
+
+ ##############
+ if isRealData:
+ output['cutflow'][dataset]['all'] += nEvents
+ output['cutflow'][dataset]['all (weight 1)'] += nEvents
+ else:
+ output['cutflow'][dataset]['all'] += ak.sum(events.genWeight/abs(events.genWeight))
+ output['cutflow'][dataset]['all (weight 1)'] += nEvents
+
+
+ #trigger_met = np.zeros(nEvents, dtype='bool')
+
+ trigger_ee = np.zeros(nEvents, dtype='bool')
+ trigger_mm = np.zeros(nEvents, dtype='bool')
+
+ #trigger_e = np.zeros(nEvents, dtype='bool')
+ #trigger_m = np.zeros(nEvents, dtype='bool')
+
+ #for t in self._nunu_hlt[self._year]:
+ # # so that already seems to be the check for whether the path exists in the file or not
+ # if t in events.HLT.fields:
+ # trigger_met = trigger_met | events.HLT[t]
+
+ for t in self._mumu_hlt[self._year]:
+ if t in events.HLT.fields:
+ trigger_mm = trigger_mm | events.HLT[t]
+
+ for t in self._ee_hlt[self._year]:
+ if t in events.HLT.fields:
+ trigger_ee = trigger_ee | events.HLT[t]
+
+ #for t in self._munu_hlt[self._year]:
+ # if t in events.HLT.fields:
+ # trigger_m = trigger_m | events.HLT[t]
+
+ #for t in self._emu_hlt[self._year]:
+ # if t in events.HLT.fields:
+ # trigger_e = trigger_e | events.HLT[t]
+
+
+ selection.add('trigger_ee', ak.to_numpy(trigger_ee))
+ selection.add('trigger_mumu', ak.to_numpy(trigger_mm))
+
+
+ # apart from the comments above about EOY/UL, should be fine
+ metfilter = np.ones(nEvents, dtype='bool')
+ for flag in
self._met_filters[self._year]['data' if isRealData else 'mc']: + metfilter &= np.array(events.Flag[flag]) + selection.add('metfilter', metfilter) + del metfilter + + + + # Not strictly necessary for Zll + met = ak.zip({ + "pt": events.MET.pt, + "phi": events.MET.phi, + "energy": events.MET.sumEt, + }, with_name="PtEtaPhiMLorentzVector" + ) + + + + split_by_flav = False + sampleFlavSplit = np.zeros(nEvents) + possible_flavSplits = ['already_split_sample'] + selection.add('already_split_sample',sampleFlavSplit == 0) + if not isRealData and not self._debug: + if doFlavSplit == '1' and not (int(sample_type) >= 27 and int(sample_type) <= 39): + split_by_flav = True + # uses the same naming scheme as AT, although udbsg is counterintuitive (b? [sic!]) + possible_flavSplits = ['_cc','_bb','_bc','_cl','_bl','_udbsg'] + # ================================================================================= + # + # # Split V+jets BG by flavour, via GenJet + # + # --------------------------------------------------------------------------------- + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L2184-L2228 + gen_jet = events.GenJet + + cGenJetTot = ak.sum((gen_jet.hadronFlavour == 4) & (gen_jet.pt > 20) & (abs(gen_jet.eta) < 2.4), axis=1) + bGenJetTot = ak.sum((gen_jet.hadronFlavour == 5) & (gen_jet.pt > 20) & (abs(gen_jet.eta) < 2.4), axis=1) + + tag_cc = cGenJetTot >= 2 + tag_bb = bGenJetTot >= 2 + tag_bc = (bGenJetTot == 1) & (cGenJetTot == 1) + tag_cl = (cGenJetTot == 1) & (bGenJetTot == 0) + tag_bl = (bGenJetTot == 1) & (cGenJetTot == 0) + tag_ll = (cGenJetTot == 0) & (bGenJetTot == 0) + + sampleFlavSplit = 1 * tag_cc + 2 * tag_bb + 3 * tag_bc + 4 * tag_cl + 5 * tag_bl + 6 * tag_ll + selection.add('_cc',sampleFlavSplit == 1) + selection.add('_bb',sampleFlavSplit == 2) + selection.add('_bc',sampleFlavSplit == 3) + selection.add('_cl',sampleFlavSplit == 4) + selection.add('_bl',sampleFlavSplit == 5) + selection.add('_udbsg',sampleFlavSplit == 6) # tbf I don't know why it contains b + + #elif dataset in ['WZTo1L1Nu2Q', 'ZZTo2L2Q', 'ZZTo2Q2Nu']: # VZ signal datasets + elif int(sample_type) in [32,36,37]: # VZ signal datasets + split_by_flav = True + possible_flavSplits = ['cc','bb','ll'] + # ================================================================================= + # + # # Split VZ signal by flavour, via GenPart + # + # --------------------------------------------------------------------------------- + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L2229-L2264 + gen_part = events.GenPart + + + Z_decay_mothers_A = (abs(gen_part.pdgId) == 23) & (gen_part.hasFlags('isLastCopy')) + + Z_decays = gen_part[Z_decay_mothers_A] + output['cutflow'][dataset]['GenPart VZ signal'] += ak.sum(Z_decay_mothers_A) + + n_b_from_Z = ak.sum(ak.sum(abs(Z_decays.children.pdgId) == 5, axis=-1), axis=-1) + n_c_from_Z = ak.sum(ak.sum(abs(Z_decays.children.pdgId) == 4, axis=-1), axis=-1) + + + + VZ_cc = (n_c_from_Z >= 2) + VZ_bb = (n_b_from_Z >= 2) + VZ_others = (~VZ_cc) & (~VZ_bb) + # 1, 2 and 3 identical to what was done in AnalysisTools! Do not confuse with BTV / hadron / parton flavour... 
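+ # e.g. a ZZ -> 2L2Q event where the hadronic Z goes to c cbar has n_c_from_Z = 2,
+ # so VZ_cc is True and the weighted sum below yields sampleFlavSplit == 1 ('cc');
+ # the boolean masks act as 0/1 integers in the arithmetic.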
+ sampleFlavSplit = 1 * VZ_cc + 2 * VZ_bb + 3 * VZ_others + + #print(sampleFlavSplit.type) + + selection.add('cc',sampleFlavSplit == 1) + selection.add('bb',sampleFlavSplit == 2) + selection.add('ll',sampleFlavSplit == 3) + + elif int(sample_type) in [27,28,29,30,31,33,34,35,38,39]: + possible_flavSplits = ['ll'] + sampleFlavSplit = sampleFlavSplit + 3 + selection.add('ll',sampleFlavSplit == 3) + split_by_flav = True + + # this is how it looked in AT for comparison: + ''' + else if( cursample->doJetFlavorSplit + && ( mInt("sampleIndex")==27 || mInt("sampleIndex")==28 + || mInt("sampleIndex")==29 || mInt("sampleIndex")==30 + || mInt("sampleIndex")==31 || mInt("sampleIndex")==33 + || mInt("sampleIndex")==34 || mInt("sampleIndex")==35 + || mInt("sampleIndex")==38 || mInt("sampleIndex")==39 + ) + ){ + *in["sampleIndex"] = mInt("sampleIndex")*100 + 3; + ''' + + + + + + # ================================================================================= + # + # # Reconstruct and preselect leptons + # + # --------------------------------------------------------------------------------- + + + # Adopt from https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L3369-L3440 + # https://gitlab.cern.ch/aachen-3a/vhcc-nano/-/blob/master/VHccProducer.py#L345-389 + + # ## Muon cuts + ## muon twiki: https://twiki.cern.ch/twiki/bin/view/CMS/SWGuideMuonIdRun2 + #event_mu = events.Muon[ak.argsort(events.Muon.pt, axis=1, ascending=False)] + event_mu = events.Muon + # looseId >= 1 or looseId seems to be the same... + musel = ((event_mu.pt > 20) & (abs(event_mu.eta) < 2.4) & (event_mu.looseId >= 1) & (event_mu.pfRelIso04_all<0.25)) + # but 25GeV and 0.06 for 1L, xy 0.05 z 0.2, &(abs(event_mu.dxy)<0.06)&(abs(event_mu.dz)<0.2) and tightId for 1L + event_mu = event_mu[musel] + event_mu = event_mu[ak.argsort(event_mu.pt, axis=1, ascending=False)] + event_mu["lep_flav"] = 13*event_mu.charge + event_mu= ak.pad_none(event_mu,2,axis=1) + nmu = ak.sum(musel,axis=1) + # ToDo: PtCorrGeoFit + + # ## Electron cuts + ## # electron twiki: https://twiki.cern.ch/twiki/bin/viewauth/CMS/CutBasedElectronIdentificationRun2 + #event_e = events.Electron[ak.argsort(events.Electron.pt, axis=1,ascending=False)] + event_e = events.Electron + elesel = ((event_e.pt > 20) & (abs(event_e.eta) < 2.5) & (event_e.mvaFall17V2Iso_WP90==1) & (event_e.pfRelIso03_all<0.25)) + # but 30GeV and WP80 for 1L + event_e = event_e[elesel] + # something I saw in a recent presentation, and also in AT code: + # https://indico.desy.de/event/34473/contributions/122201/attachments/76587/98753/RTG_Meeting_01_09_22.pdf + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/VHccAnalysis/PlotWithVarial/ZllHccLowPt.py#L256-L257 + # is to require "good electrons", which means excluding some region (eta), + # I guess it has sth to do with transition between barrel / endcap? 
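+ # (That is indeed the reason: 1.4442 < |eta| < 1.566 is the usual veto of the ECAL
+ # barrel-endcap transition ("crack") region, where electron reconstruction is
+ # degraded. Strictly the veto is defined on the supercluster eta,
+ # eta + deltaEtaSC, while the cut below uses the electron eta itself.)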
+ event_e = event_e[(abs(event_e.eta) > 1.5660) | (abs(event_e.eta) < 1.4442)] + event_e = event_e[ak.argsort(event_e.pt, axis=1,ascending=False)] + event_e["lep_flav"] = 11*event_e.charge + event_e = ak.pad_none(event_e,2,axis=1) + nele = ak.sum(elesel,axis=1) + # sorting after selecting should be faster (less computations on average) + + # for this channel (Zll / 2L) + selection.add('lepsel',ak.to_numpy((nele==2)|(nmu==2))) + + + + #### build lepton pair(s) + good_leptons = ak.with_name( + ak.concatenate([ event_e, event_mu], axis=1), + "PtEtaPhiMCandidate", ) + good_leptons = good_leptons[ak.argsort(good_leptons.pt, axis=1,ascending=False)] + leppair = ak.combinations( + good_leptons, + n=2, + replacement=False, + axis=-1, + fields=["lep1", "lep2"], + ) + + ll_cand = ak.zip({ + "lep1" : leppair.lep1, + "lep2" : leppair.lep2, + "pt": (leppair.lep1+leppair.lep2).pt, + "eta": (leppair.lep1+leppair.lep2).eta, + "phi": (leppair.lep1+leppair.lep2).phi, + "mass": (leppair.lep1+leppair.lep2).mass, + }, with_name="PtEtaPhiMLorentzVector" + ) + # probably there needs to be a cross-check that we don't include more than we want here, + # I know there is the option to truncate the array if more than 1 is found + # --> clip = True + ll_cand = ak.pad_none(ll_cand,1,axis=1) + + # there seem to be multiple ways to get the "one" ll_cand of interest + # - closest to Z-mass [makes sense] + # I think others use this + # - lepton-pair with highest pt [also, maybe it's even the same in the majority of the cases] + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L3369-L3440 + + if (ak.count(ll_cand.pt)>0): + ll_cand = ll_cand[ak.argsort(ll_cand.pt, axis=1,ascending=False)] + # try the second option here + # NOTE: Comment out to debug stuff + ll_cand = ll_cand[:, 0] + + + + # ================================================================================= + # + # # Reconstruct and preselect jets + # + # --------------------------------------------------------------------------------- + + # Apply correction: + if isRealData: + #print(dataset_long) + jets = jec(events,events.Jet,dataset_long,self._year,self._corr) + else: + jets = jec(events,events.Jet,dataset,self._year,self._corr) + #jets = events.Jet + + # This was necessary for the FSR code + #jets = jets.mask[ak.num(jets) > 2] + + + + # For EOY: recalculate CvL & CvB here, because the branch does not exist in older files + # adapted from PostProcessor + def deepflavcvsltag(jet): + btagDeepFlavL = 1.-(jet.btagDeepFlavC+jet.btagDeepFlavB) + return ak.where((jet.btagDeepFlavB >= 0.) & (jet.btagDeepFlavB < 1.) & (jet.btagDeepFlavC >= 0.) & (btagDeepFlavL >= 0.), + jet.btagDeepFlavC/(1.-jet.btagDeepFlavB), + (-1.) * ak.ones_like(jet.btagDeepFlavB)) + + def deepflavcvsbtag(jet): + btagDeepFlavL = 1.-(jet.btagDeepFlavC+jet.btagDeepFlavB) + return ak.where((jet.btagDeepFlavB > 0.) & (jet.btagDeepFlavC > 0.) & (btagDeepFlavL >= 0.), + jet.btagDeepFlavC/(jet.btagDeepFlavC+jet.btagDeepFlavB), + (-1.) * ak.ones_like(jet.btagDeepFlavB)) + + # Alternative ways: + # - depending on the Nano version, there might already be bTagDeepFlavCvL available + # - one could instead use DeepCSV via bTagDeepCvL + # - not necessarily use CvL, other combination possible ( CvB | pt | BDT? 
) + + #jets["btagDeepFlavCvL"] = deepflavcvsltag(jets) + #jets["btagDeepFlavCvB"] = deepflavcvsbtag(jets) + jets = jets[ak.argsort(jets.btagDeepFlavCvL, axis=1, ascending=False)] + + + # Jets are considered only if the following identification conditions hold, as mentioned in AN + # - Here is some documentation related to puId and jetId: + # https://twiki.cern.ch/twiki/bin/viewauth/CMS/PileupJetID + # https://twiki.cern.ch/twiki/bin/viewauth/CMS/JetID + jet_conditions = ((abs(jets.eta) < 2.4) & (jets.pt > 20) & (jets.puId > 0)) \ + | ((jets.pt>50) & (jets.jetId>5)) + # Count how many jets exist that pass this selection + njet = ak.sum(jet_conditions,axis=1) + selection.add('jetsel',ak.to_numpy(njet>=2)) + + + # ================================================================================= + # + # # FSR recovery + # + # --------------------------------------------------------------------------------- + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L841-L956 + + # FSR jets are selected with slightly different criteria + fsr_conditions = (abs(jets.eta) < 3) & (jets.pt > 20) \ + & ak.all(jets.metric_table(ll_cand.lep1)>0.2) & ak.all(jets.metric_table(ll_cand.lep2)>0.2) + # Take the first two jets that pass the criteria and check the remaining ones, + # as well as potentially others, to get FSR jets: + pick2 = jets[ak.pad_none(ak.local_index(jets, 1)[jet_conditions], 2)[:, :2]] + others = jets[ak.concatenate([ak.pad_none(ak.local_index(jets, 1)[(jet_conditions) & (fsr_conditions)], 2)[:, 2:], + ak.local_index(jets, 1)[(~jet_conditions) & (fsr_conditions)] + ], axis=1)] + + + def find_fsr(leading, subleading, others, threshold=0.8): + mval1, (a1, b) = leading.metric_table(others, return_combinations=True) + mval2, (a2, b) = subleading.metric_table(others, return_combinations=True) + + def res(mval, out): + order = ak.argsort(mval, axis=-1) + return out[order], mval[order] + + out1, metric1 = res(mval1, b) + out2, metric2 = res(mval2, b) + + out1 = out1.mask[(metric1 <= threshold) & (metric1 < metric2)] + out2 = out2.mask[(metric2 <= threshold) & (metric2 < metric1)] + #out2 = out2.mask[(metric1 <= threshold) & (metric2 < metric1)] + return out1[:, 0, ...], out2[:, 0, ...] 
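+
+ # In words: find_fsr returns, per event, the candidate FSR jets assigned to the
+ # leading and subleading jet respectively; each jet in `others` within
+ # deltaR <= threshold is attached to whichever of the two it is closer to
+ # (on an exact tie both strict '<' comparisons fail and the jet goes to neither).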
+ + + missing = ~(ak.is_none(pick2[:, 0]) | ak.is_none(pick2[:, 1])) + pick2 = pick2.mask[missing] + others = others.mask[missing] + + + leading, subleading = pick2[:, 0], pick2[:, 1] + fsr_leading, fsr_subleading = find_fsr(leading, subleading, others, threshold=0.8) + + #print(leading.pt) + #print((leading + fsr_leading.sum()).pt) + + # To explicitly check that adding FSR does indeed have an effect + #print(ak.sum((leading + fsr_leading.sum()).pt != leading.pt)) + + #print(leading.type) + + # Collect the (sub-)leading jets and their respective FSR jets in a new 4-vector + leading_with_fsr = ak.zip({ + "jet1" : leading, + "jet2" : fsr_leading.sum(), + "pt": (leading + fsr_leading.sum()).pt, + "eta": (leading + fsr_leading.sum()).eta, + "phi": (leading + fsr_leading.sum()).phi, + "mass": (leading + fsr_leading.sum()).mass, + },with_name="PtEtaPhiMLorentzVector",) + + subleading_with_fsr = ak.zip({ + "jet1" : subleading, + "jet2" : fsr_subleading.sum(), + "pt": (subleading + fsr_subleading.sum()).pt, + "eta": (subleading + fsr_subleading.sum()).eta, + "phi": (subleading + fsr_subleading.sum()).phi, + "mass": (subleading + fsr_subleading.sum()).mass, + },with_name="PtEtaPhiMLorentzVector",) + + + # (Maybe) one could calculate the angle between FSR & the "main" jet they correspond to + # - this would be correlated with the mass of the decaying p. via the dead-cone effect, + # - could be a discriminating variable at the event level. + + # ================================================================================= + # + # # Build Higgs candidate w/ or w/o FSR + # + # --------------------------------------------------------------------------------- + + # Build 4-vector from leading + subleading jets, with or without FSR + higgs_cand_no_fsr = ak.zip({ + "jet1" : leading, + "jet2" : subleading, + "pt": (leading + subleading).pt, + "eta": (leading + subleading).eta, + "phi": (leading + subleading).phi, + "mass": (leading + subleading).mass, + },with_name="PtEtaPhiMLorentzVector",) + + higgs_cand = ak.zip({ + "jet1" : leading_with_fsr, + "jet2" : subleading_with_fsr, + "pt": (leading_with_fsr + subleading_with_fsr).pt, + "eta": (leading_with_fsr + subleading_with_fsr).eta, + "phi": (leading_with_fsr + subleading_with_fsr).phi, + "mass": (leading_with_fsr + subleading_with_fsr).mass, + },with_name="PtEtaPhiMLorentzVector",) + + + + # ================================================================================= + # + # # Actual event selection starts here + # + # --------------------------------------------------------------------------------- + + + # Common global requirements in the Zll channel + # - valid for 2LH and 2LL + # - valid for any region, no matter if SR or CR + + # leppair and ll_cand have different dim, leppair contains lists, + # ll_cand only numbers on innermost dim (because already reduced above) + # therefore when evaluating ak.any with axis=-1, + # ll_cand will ALWAYS be true (a.k.a. 
for every event), as long as one event fulfils the criterion + # for leppair, there needs to be one per event, as expected + # print((leppair.lep1.pt>20)) + # print((ll_cand.mass>75)) + # print((higgs_cand.mass<250)) + # print((njet>=2)) + # inside any one can then only place stuff that has one more dim + + # related to individual leptons + req_global = ak.any((leppair.lep1.pt>20) & (leppair.lep2.pt>20) \ + # opposite charge + & ((leppair.lep1.charge+leppair.lep2.charge)==0) \ + , axis=-1 + ) + # cands and global stuff + # note: V_pt > 60 as in AT, AN: 50 (don't confuse) + req_global = req_global \ + & (ll_cand.pt>60) \ + & (njet>=2) \ + & (higgs_cand.mass<250) + + + selection.add('global_selection',ak.to_numpy(req_global)) + + + mask2e = req_global & (nele == 2) + mask2mu = req_global & (nmu == 2) + + #mask2lep = [ak.any(tup) for tup in zip(maskemu, mask2mu, mask2e)] + mask2lep = [ak.any(tup) for tup in zip(mask2mu, mask2e)] + + good_leptons = ak.mask(good_leptons,mask2lep) + + + #output['cutflow'][dataset]['selected Z pairs'] += ak.sum(ak.num(good_leptons)>0) + + selection.add('ee',ak.to_numpy(nele == 2)) + selection.add('mumu',ak.to_numpy(nmu == 2)) + + + #print(higgs_cand.type) + #print(ll_cand.type) + + # global already contains Vpt>60 as the lower bound + # global also has higgs_cand.mass<250 + req_sr_Zll = (ll_cand.mass > 75) & (ll_cand.mass < 105) \ + & (higgs_cand.delta_phi(ll_cand)>2.5) \ + & (higgs_cand.mass>=50) & (higgs_cand.mass<=200) \ + & (leading.btagDeepFlavCvL>0.225) & (leading.btagDeepFlavCvB>0.4) + # flip H mass, otherwise same + req_cr_Zcc = (ll_cand.mass > 85) & (ll_cand.mass < 97) \ + & (higgs_cand.delta_phi(ll_cand)>2.5) \ + & ~((higgs_cand.mass>=50) & (higgs_cand.mass<=200)) \ + & (leading.btagDeepFlavCvL>0.225) & (leading.btagDeepFlavCvB>0.4) + # Note: m_ll requirement not in AN, but in AT + req_cr_Z_LF = (ll_cand.mass > 75) & (ll_cand.mass < 105) \ + & (higgs_cand.delta_phi(ll_cand)>2.5) \ + & (higgs_cand.mass>=50) & (higgs_cand.mass<=200) \ + & (leading.btagDeepFlavCvL<0.225) & (leading.btagDeepFlavCvB>0.4) + + req_cr_Z_HF = (ll_cand.mass > 85) & (ll_cand.mass < 97) \ + & (higgs_cand.delta_phi(ll_cand)>2.5) \ + & (higgs_cand.mass>=50) & (higgs_cand.mass<=200) \ + & (leading.btagDeepFlavCvL>0.225) & (leading.btagDeepFlavCvB<0.4) + + req_cr_t_tbar = ~((ll_cand.mass>0) & (ll_cand.mass<10)) & ~((ll_cand.mass>75) & (ll_cand.mass<120)) \ + & (higgs_cand.mass>=50) & (higgs_cand.mass<=200) \ + & (leading.btagDeepFlavCvL>0.225) & (leading.btagDeepFlavCvB<0.4) + + req_sr_Zll_vpt_low = req_global & req_sr_Zll & (ll_cand.pt<150) + # print(ll_cand.pt<150) + # print(ak.any(ll_cand.pt<150, axis=-1) + # print(req_sr_Zll_vpt_low) + req_sr_Zll_vpt_high = req_global & req_sr_Zll & (ll_cand.pt>150) + # print(ll_cand.pt>150) + # print(req_sr_Zll_vpt_high) + # print(len(req_sr_Zll_vpt_low)) + # print(len(req_sr_Zll_vpt_low == req_sr_Zll_vpt_high)) + # print(np.sum(ak.to_numpy(req_sr_Zll_vpt_low))) + # print(np.sum(ak.to_numpy(req_sr_Zll_vpt_low == req_sr_Zll_vpt_high))) + + req_cr_Zcc_vpt_low = req_global & req_cr_Zcc & (ll_cand.pt<150) + # print(req_sr_Zll_vpt_low) + req_cr_Zcc_vpt_high = req_global & req_cr_Zcc & (ll_cand.pt>150) + # print(req_sr_Zll_vpt_high) + # print(np.sum(ak.to_numpy(req_sr_Zll_vpt_low & req_sr_Zll_vpt_high))) + + req_cr_Z_LF_vpt_low = req_global & req_cr_Z_LF & (ll_cand.pt<150) + req_cr_Z_LF_vpt_high = req_global & req_cr_Z_LF & (ll_cand.pt>150) + + req_cr_Z_HF_vpt_low = req_global & req_cr_Z_HF & (ll_cand.pt<150) + req_cr_Z_HF_vpt_high = req_global & 
req_cr_Z_HF & (ll_cand.pt>150) + + req_cr_t_tbar_vpt_low = req_global & req_cr_t_tbar & (ll_cand.pt<150) + req_cr_t_tbar_vpt_high = req_global & req_cr_t_tbar & (ll_cand.pt>150) + + + #prob not necessary + #selection.add('SR',ak.to_numpy(req_sr_Zll)) + + selection.add('SR_2LL',ak.to_numpy(req_sr_Zll_vpt_low)) + selection.add('SR_2LH',ak.to_numpy(req_sr_Zll_vpt_high)) + selection.add('CR_Zcc_2LL',ak.to_numpy(req_cr_Zcc_vpt_low)) + selection.add('CR_Zcc_2LH',ak.to_numpy(req_cr_Zcc_vpt_high)) + selection.add('CR_Z_LF_2LL',ak.to_numpy(req_cr_Z_LF_vpt_low)) + selection.add('CR_Z_LF_2LH',ak.to_numpy(req_cr_Z_LF_vpt_high)) + selection.add('CR_Z_HF_2LL',ak.to_numpy(req_cr_Z_HF_vpt_low)) + selection.add('CR_Z_HF_2LH',ak.to_numpy(req_cr_Z_HF_vpt_high)) + selection.add('CR_t_tbar_2LL',ak.to_numpy(req_cr_t_tbar_vpt_low)) + selection.add('CR_t_tbar_2LH',ak.to_numpy(req_cr_t_tbar_vpt_high)) + + + + + + # ================================================================================= + # + # # Calculate and store weights & factors + # + # --------------------------------------------------------------------------------- + + # there is also nProcEvents, which might be related to nEvents by some factor + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/HelperClasses/SampleContainer.cc + # there are some more calculations related to weights, e.g. + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/HelperClasses/SampleContainer.cc#L115-L154 + + # ToDo: + # [ ] LHEScaleWeight ?? + # [ ] intWeight - is this only relevant when running over the post-processed samples, or already on top of Nano+AK15? + # [x] genWeight + # [ ] PrefireWeight - (for 2016+2017) see also: + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L2099-L2113 + # [ ] weight_PU + # [ ] weight_ptEWK + # [(x)] Lep_SF - but I'm not sure about EOY / UL compatibility + # [ ] recoWReWeight + # [ ] WJetNLOWeight + # [ ] cTagWeight - later, also including up/down syst + # [ ] weight_mettrigSF + # [ ] weight_puid - not the same as _PU + # [ ] weight_subptEWKnnlo - find out what "SubGen" is + # + # [ ] LOtoNLOWeightBjetSplitEtabb + # [ ] WPtCorrFactor + # [ ] ZPtCorrFactor + + + + + # running over more than just the Double[] datasets, but still requiring the same trigger + # not sure if correct + if 'DoubleEG' in dataset or 'Electron' in dataset: + output['cutflow'][dataset]['trigger'] += ak.sum(trigger_ee) + elif 'Muon' in dataset : + output['cutflow'][dataset]['trigger'] += ak.sum(trigger_mm) + + + # Successively add another cut w.r.t. 
previous line, looks a bit like N-1 histograms + output['cutflow'][dataset]['jet selection'] += ak.sum(njet>=2) + output['cutflow'][dataset]['global selection'] += ak.sum(req_global) + output['cutflow'][dataset]['signal region'] += ak.sum(req_global & req_sr_Zll) + output['cutflow'][dataset]['signal region & ee or mumu'] += ak.sum(req_global & req_sr_Zll & ( ((nele == 2) & trigger_ee) | ((nmu == 2) & trigger_mm))) + output['cutflow'][dataset]['signal ee'] += ak.sum(req_global & req_sr_Zll & (nele == 2) & trigger_ee) + output['cutflow'][dataset]['signal mumu'] += ak.sum(req_global & req_sr_Zll & (nmu == 2) & trigger_mm) + + + lepflav = ['ee','mumu'] + reg = ['SR_2LL','SR_2LH', + 'CR_Zcc_2LL','CR_Zcc_2LH', + 'CR_Z_LF_2LL','CR_Z_LF_2LH', + 'CR_Z_HF_2LL','CR_Z_HF_2LH', + 'CR_t_tbar_2LL','CR_t_tbar_2LH'] + + #print(possible_flavSplits) + list_weights = [] + lists_of_vars = {} + names = ['wei', 'Higgs_mass', 'Higgs_pt', 'Z_mass', 'Z_pt', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_phi_jj', 'del_eta_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + modes = ['low_ee', 'high_ee', 'low_mumu', 'high_mumu'] + for name in names: + for mode in modes: + lists_of_vars[f'{name}_{mode}'] = [] + ''' + lists_of_vars = {'wei': [], + 'Higgs_mass': [], + 'Higgs_pt': [], + 'Z_pt': [], + 'jjVptratio': [], + 'CvsL_max': [], + 'CvsL_min': [], + 'CvsB_max': [], + 'CvsB_min': [], + 'pt_lead': [], + 'pt_sublead': [], + 'del_phi_jjV': [], + 'del_R_jj': [], + 'del_eta_jj': [], + 'del_phi_ll': [], + 'del_eta_ll': [], + 'del_phi_l2_subleading': [], + 'del_phi_l2_leading': [] + } + ''' + #### write into histograms (i.e. write output) + for histname, h in output.items(): + for s in possible_flavSplits: + dataset_renamed = dataset if s == 'already_split_sample' else dataset + s + for ch in lepflav: + for r in reg: + cut = selection.all('lepsel', + 'jetsel', + 'global_selection', + 'metfilter', + 'lumi', + r, + ch, + s, + 'trigger_%s'%(ch)) + llcut = ll_cand[cut] + # this next line is necessary if running with multiple possible ll candidates + #llcut = llcut[:,0] + + lep1cut = llcut.lep1 + lep2cut = llcut.lep2 + #print(self._version) + if not isRealData and not self._debug: + #print('not data, not test') + if ch == 'ee': + lepsf = eleSFs(lep1cut, self._year, self._corr) * eleSFs(lep2cut, self._year, self._corr) + elif ch == 'mumu': + lepsf = muSFs(lep1cut, self._year, self._corr) * muSFs(lep2cut, self._year, self._corr) + ''' + # This would be emu channel, which does not exist in the VHcc Zll case + else: + lepsf = np.where(lep1cut.lep_flav == 11, + eleSFs(lep1cut, self._year, self._corr) * muSFs(lep2cut, self._year, self._corr), + 1.) \ + * np.where(lep1cut.lep_flav == 13, + eleSFs(lep2cut, self._year, self._corr) * muSFs(lep1cut, self._year, self._corr), + 1.) + ''' + else : + #lepsf = weights.weight()[cut] + # AS: if I understand correctly, this only works because in case of data, weights are identically 1 for every entry + # otherwise this would double count the weights in a later step (where lepsf gets multiplied by the weights!) 
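+ # hence for data, lepsf is set to exactly 1 per selected event (same shape as
+ # weights.weight()[cut]), keeping the `weight * lepsf` products below uniform
+ # across data and MC: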
+ lepsf = ak.full_like(weights.weight()[cut], 1) + #print(lepsf) + # print(weights.weight()[cut]*lepsf) + # print(lepsf) + ''' + if self._export_array and not isRealData: + if ch == 'ee' and r == 'SR_2LL' and s == '_cc': + eell_cand = ak.zip({ + "Higgs_mass" : higgs_cand['mass'][cut] * lepsf, + #"jet2" : subleading_with_fsr, + #"pt": (leading_with_fsr + subleading_with_fsr).pt, + #"eta": (leading_with_fsr + subleading_with_fsr).eta, + #"phi": (leading_with_fsr + subleading_with_fsr).phi, + #"mass": (leading_with_fsr + subleading_with_fsr).mass, + }) + print(eell_cand) + ''' + if 'leading_jetflav_' in histname and 'sub' not in histname: + #print(dir(leading)) + #print(h.axes) + names = [ax.name for ax in h.axes] + fields = {l: normalize(leading[histname.replace('leading_jetflav_','')], + cut) for l in names if l in dir(leading)} + #print(fields) + #sys.exit() + if isRealData: + flavor = ak.zeros_like(normalize(leading['pt'],cut)) + else: + flavor = normalize(leading.hadronFlavour,cut) + h.fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + flav = flavor, + **fields, + weight = weights.weight()[cut] * lepsf) + elif 'subleading_jetflav_' in histname: + #print(dir(subleading)) + names = [ax.name for ax in h.axes] + fields = {l: normalize(subleading[histname.replace('subleading_jetflav_','')], + cut) for l in names if l in dir(subleading)} + if isRealData: + flavor = ak.zeros_like(normalize(subleading['pt'],cut)) + else: + flavor = normalize(subleading.hadronFlavour,cut) + h.fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + flav = flavor, + **fields, + weight = weights.weight()[cut] * lepsf) + elif 'lep1_' in histname: + names = [ax.name for ax in h.axes] + fields = {l: ak.fill_none(flatten(lep1cut[histname.replace('lep1_','')]), + np.nan) for l in names if l in dir(lep1cut)} + h.fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + **fields, + weight = weights.weight()[cut] * lepsf) + elif 'lep2_' in histname: + names = [ax.name for ax in h.axes] + fields = {l: ak.fill_none(flatten(lep2cut[histname.replace('lep2_','')]), + np.nan) for l in names if l in dir(lep2cut)} + h.fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + **fields, + weight = weights.weight()[cut] * lepsf) + #elif 'MET_' in histname: + # fields = {l: normalize(events.MET[histname.replace('MET_','')], + # cut) for l in names if l in dir(events.MET)} + # h.fill( + # datasetSplit = dataset_renamed, + # lepflav = ch, + # region = r, + # **fields, + # weight = weights.weight()[cut] * lepsf) + elif 'll_' in histname: + names = [ax.name for ax in h.axes] + fields = {l: ak.fill_none(flatten(llcut[histname.replace('ll_','')]), + np.nan) for l in names if l in dir(llcut)} + #print(max(llcut['pt'])) + h.fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + **fields, + weight = weights.weight()[cut] * lepsf) + elif 'jj_' in histname: + names = [ax.name for ax in h.axes] + fields = {l: normalize(higgs_cand[histname.replace('jj_','')], + cut) for l in names if l in dir(higgs_cand)} + h.fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + **fields, + weight = weights.weight()[cut] * lepsf) + else: + output['nj'].fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + nj = normalize(ak.num(jet_conditions),cut), + weight = weights.weight()[cut]*lepsf) + # check? 
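+ # (Re the check: ak.num(jet_conditions) is the length of each per-event list,
+ # i.e. it counts *all* jets, while the number of jets passing jet_conditions
+ # would be ak.sum(jet_conditions, axis=1), as used for `njet` earlier; the same
+ # holds for the two nAddJets fills below, which subtract 2 and clip at zero.)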
+ output['nAddJets302p5_puid'].fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + nAddJets302p5_puid = normalize(ak.where((ak.num(jet_conditions) > 2), + (ak.num(jet_conditions)-2), + (ak.zeros_like(ak.num(jet_conditions))) + ), + cut), + weight = weights.weight()[cut]*lepsf) + # check? + output['nAddJetsFSRsub302p5_puid'].fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + nAddJetsFSRsub302p5_puid = normalize(ak.where((ak.where((ak.num(jet_conditions) > 2), + (ak.num(jet_conditions)-2), + (ak.zeros_like(ak.num(jet_conditions))) + ) + -ak.num((~jet_conditions) & (fsr_conditions))) > 0, + (ak.where((ak.num(jet_conditions) > 2), + (ak.num(jet_conditions)-2), + (ak.zeros_like(ak.num(jet_conditions))) + ) + -ak.num((~jet_conditions) & (fsr_conditions))), + (ak.zeros_like(ak.num(jet_conditions)))), + cut), + weight = weights.weight()[cut]*lepsf) + #if not isRealData: + # output['weight_full'].fill( + # datasetSplit = dataset_renamed, + # lepflav = ch, + # region = r, + # weight_full = weights.weight()[cut]*lepsf) + # output['genweight'].fill( + # datasetSplit = dataset_renamed, + # lepflav = ch, + # region = r, + # genWeight = events.genWeight[cut]) + # output['sign_genweight'].fill( + # datasetSplit = dataset_renamed, + # lepflav = ch, + # region = r, + # genWeight_by_abs = (events.genWeight/abs(events.genWeight))[cut]) + output['jjVPtRatio'].fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + jjVPtRatio = (normalize(higgs_cand['pt'], + cut) / ak.fill_none(flatten(llcut['pt']), + np.nan)), + weight = weights.weight()[cut] * lepsf) + if self._export_array and not isRealData: + import pandas as pd + #output['array'][dataset]['weight'] += processor.column_accumulator( + # ak.to_numpy(weights.weight()[cut] * lepsf) + # ) + + list_weights.append(ak.to_numpy(weights.weight()[cut] * lepsf)) + + roi = ['SR_2LL','SR_2LH'] + lepflav_chosen = ['ee','mumu'] + names_dict = {'wei': weights.weight()[cut] * lepsf, + 'Higgs_mass': higgs_cand['mass'][cut] * lepsf, + 'Higgs_pt': higgs_cand['pt'][cut] * lepsf, + 'Z_mass': ll_cand['mass'][cut] * lepsf, + 'Z_pt': ll_cand['pt'][cut] * lepsf, + 'jjVptratio': (higgs_cand['pt'][cut] * lepsf)/ (ll_cand['pt'][cut] * lepsf), + 'CvsL_max': leading_with_fsr['jet1']['btagDeepFlavCvL'][cut] * lepsf, + 'CvsL_min': subleading_with_fsr['jet1']['btagDeepFlavCvL'][cut] * lepsf, + 'CvsB_max': leading_with_fsr['jet1']['btagDeepFlavCvB'][cut] * lepsf, + 'CvsB_min': subleading_with_fsr['jet1']['btagDeepFlavCvB'][cut] * lepsf, + 'pt_lead': leading_with_fsr['jet1']['pt'][cut] * lepsf, + 'pt_sublead': subleading_with_fsr['jet1']['pt'][cut] * lepsf, + 'del_phi_jjV': np.abs((higgs_cand[cut] * lepsf).delta_phi((ll_cand[cut] * lepsf))), + 'del_R_jj': np.abs((higgs_cand['jet1'][cut] * lepsf).delta_r((higgs_cand['jet2'][cut] * lepsf))), + 'del_eta_jj': np.abs((higgs_cand['jet1']['eta'][cut] * lepsf) - ((higgs_cand['jet2']['eta'][cut] * lepsf))), + 'del_phi_jj': np.abs((higgs_cand['jet1'][cut] * lepsf).delta_phi((higgs_cand['jet2'][cut] * lepsf))), + 'del_phi_ll': np.abs((ll_cand['lep1'][cut] * lepsf).delta_phi((ll_cand['lep2'][cut] * lepsf))), + 'del_eta_ll': np.abs((ll_cand['lep1']['eta'][cut] * lepsf) - ((ll_cand['lep2']['eta'][cut] * lepsf))), + 'del_phi_l2_subleading': np.abs((ll_cand['lep2'][cut] * lepsf).delta_phi((higgs_cand['jet1'][cut] * lepsf))), + 'del_phi_l2_leading': np.abs((ll_cand['lep2'][cut] * lepsf).delta_phi((higgs_cand['jet2'][cut] * lepsf))) + } + if ch in lepflav_chosen and r in roi: + if ch == 'ee': + if r == 
'SR_2LL': + for var_name, var_value in names_dict.items(): + lists_of_vars[f'{var_name}_low_ee'].append(ak.to_numpy(var_value)) + #output['array'][dataset][f'{var_name}_low_ee'] += processor.column_accumulator( + # ak.to_numpy(var_value) + # ) + elif r == 'SR_2LH': + for var_name, var_value in names_dict.items(): + lists_of_vars[f'{var_name}_high_ee'].append(ak.to_numpy(var_value)) + #output['array'][dataset][f'{var_name}_high_ee'] += processor.column_accumulator( + # ak.to_numpy(var_value) + # ) + elif ch == 'mumu': + if r == 'SR_2LL': + for var_name, var_value in names_dict.items(): + lists_of_vars[f'{var_name}_low_mumu'].append(ak.to_numpy(var_value)) + #output['array'][dataset][f'{var_name}_low_mumu'] += processor.column_accumulator( + # ak.to_numpy(var_value) + # ) + elif r == 'SR_2LH': + for var_name, var_value in names_dict.items(): + lists_of_vars[f'{var_name}_high_mumu'].append(ak.to_numpy(var_value)) + #output['array'][dataset][f'{var_name}_high_mumu'] += processor.column_accumulator( + # ak.to_numpy(var_value) + # ) + + + + ### + regression, kinfit ??? + list_weights = np.array([item for sublist in list_weights for item in sublist]) + print(list_weights) + #print(lists_of_vars) + for v_name in lists_of_vars.keys(): + lists_of_vars[v_name] = np.array([item for sublist in lists_of_vars[v_name] for item in sublist]) + print(lists_of_vars) + if 'ZH' in dataset: + ttyp = 'signal_03' + else: + ttyp = 'back_03' + folder_save = f'condor_{ttyp}' + if not os.path.exists(f"./{folder_save}"): + os.mkdir(f"./{folder_save}") + if not os.path.exists(f"./{folder_save}/{dataset}"): + os.mkdir(f"./{folder_save}/{dataset}") + if not os.path.exists(f"./{folder_save}/{dataset}/{filename}"): + os.mkdir(f"./{folder_save}/{dataset}/{filename}") + try: + df_weights = pd.read_csv(f'{folder_save}/{dataset}/{filename}/test_save_weights_full.csv') + except FileNotFoundError: + df_weights = pd.DataFrame([], columns = ['weights']) + df_wei = pd.DataFrame([], columns = ['weights']) + df_wei['weights'] = list_weights + df_weights_full = pd.concat([df_weights, df_wei], ignore_index = True) + df_wei.to_csv(f'{folder_save}/{dataset}/{filename}/test_save_weights.csv', sep=',', encoding='utf-8', index=False) + df_weights_full.to_csv(f'{folder_save}/{dataset}/{filename}/test_save_weights_full.csv', sep=',', encoding='utf-8', index=False) + try: + df_else_everything = pd.read_csv(f'{folder_save}/{dataset}/{filename}/test_else_save_no_weights_full.csv') + except FileNotFoundError: + df_else_everything = pd.DataFrame([], columns = [v_name for v_name in lists_of_vars.keys()]) + df_else = pd.DataFrame([], columns = [v_name for v_name in lists_of_vars.keys()]) + #print(df_else) + for var in lists_of_vars.keys(): + df_else[var] = pd.Series(lists_of_vars[var]) + df_else_full = pd.concat([df_else_everything, df_else], ignore_index = True) + + df_else.to_csv(f'{folder_save}/{dataset}/{filename}/test_else_save_no_weights.csv', sep=',', encoding='utf-8', index=False) + df_else_full.to_csv(f'{folder_save}/{dataset}/{filename}/test_else_save_no_weights_full.csv', sep=',', encoding='utf-8', index=False) + + return {dataset: output} + + def postprocess(self, accumulator): + #print(accumulator) + return accumulator diff --git a/Zll_process_newHist_pandas_small_update_isolation.py b/Zll_process_newHist_pandas_small_update_isolation.py new file mode 100644 index 0000000..11de771 --- /dev/null +++ b/Zll_process_newHist_pandas_small_update_isolation.py @@ -0,0 +1,1687 @@ +import csv +from curses import meta +from dataclasses import 
dataclass +import gzip +import pickle, os, sys, mplhep as hep, numpy as np +from select import select + +import json + +#from coffea import hist, processor # ToDo: move to the better hist +from coffea import processor # ToDo: move to the better hist +import hist +from hist import Hist +from coffea.nanoevents.methods import vector +import awkward as ak +from VHcc.utils.correction import jec,muSFs,eleSFs,init_corr +from coffea.lumi_tools import LumiMask +from coffea.analysis_tools import Weights, PackedSelection +from functools import partial +# import numba +from VHcc.helpers.util import reduce_and, reduce_or, nano_mask_or, get_ht, normalize, make_p4 +import particle +from hepunits import GeV + +def empty_column_accumulator(): + #return processor.column_accumulator(np.array([],dtype=object)) + return processor.column_accumulator(np.array([],dtype=np.float64)) +def array_accumulator(): + return processor.defaultdict_accumulator(empty_column_accumulator) + +def mT(obj1,obj2): + return np.sqrt(2.*obj1.pt*obj2.pt*(1.-np.cos(obj1.phi-obj2.phi))) +def flatten(ar): # flatten awkward into a 1d array to hist + return ak.flatten(ar, axis=None) +def normalize(val, cut): + if cut is None: + ar = ak.to_numpy(ak.fill_none(val, np.nan)) + return ar + else: + ar = ak.to_numpy(ak.fill_none(val[cut], np.nan)) + return ar + +def read_json(path): + f = open(path) + data = json.load(f) + return data + +def dataset_name_to_number(dataset, year): + samples_path = 'src/VHcc/metadata/sample_info_' + year + '_reversed' + + samples = read_json(samples_path+'.json') + + return samples[dataset]['type'], samples[dataset]['doJetFlavorSplit'] + +def dataset_categories(year): + map_path = 'src/VHcc/metadata/mergemap_' + year + '_Zll' + + samples = read_json(map_path+'.json').values() + all_datasets = [item for sublist in samples for item in sublist] + + return all_datasets + +def get_info_dict(year): + with open(f'src/VHcc/metadata/sample_info_{year}.json') as si: + info = json.load(si) + info_dict={} + for obj in info: + #print(obj) + info_dict[obj]=info[obj]['name'] + return info_dict + +class NanoProcessor(processor.ProcessorABC): + def __init__(self, cfg): + self.cfg = cfg + self._year = self.cfg.dataset["year"] + self._campaign = self.cfg.dataset["campaign"] + + self._version=self.cfg.userconfig['version'] # only because the new runner etc. needs that, not used later + self._export_array = True # if 'test' in self._version else False + self._debug = False #True + + # paths from table 1 and 2 of the AN_2020_235 + + # l l + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L3230-L3337 + self._mumu_hlt = { + '2016': [ + 'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL', + 'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ', + 'Mu17_TrkIsoVVL_TkMu8_TrkIsoVVL', + 'Mu17_TrkIsoVVL_TkMu8_TrkIsoVVL_DZ' + ], + '2017': [ + #'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL', + #'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ', + 'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ_Mass3p8',#allowMissingBranch=1 + 'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ_Mass8'#allowMissingBranch=1 + ], + '2018': [ + #'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL', + #'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ', + 'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ_Mass3p8',#allowMissingBranch=1 but this is the only used one in 2018?! 
+ #'Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ_Mass8'#allowMissingBranch=1 + ], + } + + self._ee_hlt = { + '2016': [ + 'Ele23_Ele12_CaloIdL_TrackIdL_IsoVL_DZ' + ], + '2017': [ + 'Ele23_Ele12_CaloIdL_TrackIdL_IsoVL', + #'Ele23_Ele12_CaloIdL_TrackIdL_IsoVL_DZ' # not in VHccAnalysis code + ], + '2018': [ + 'Ele23_Ele12_CaloIdL_TrackIdL_IsoVL' + ], + } + + ''' + # l nu + self._munu_hlt = { + '2016': [ + 'IsoMu24', + 'IsoTkMu24' + ], + '2017': [ + 'IsoMu24', + 'IsoMu27' + ], + '2018': [ + 'IsoMu24', + 'IsoMu27' + ], + } + + self._enu_hlt = { + '2016': [ + 'Ele27_eta2p1_WPTight_Gsf' + ], + '2017': [ + 'Ele32_WPTight_Gsf_L1DoubleEG', + 'Ele32_WPTight_Gsf' + ], + '2018': [ + 'Ele32_WPTight_Gsf_L1DoubleEG', + 'Ele32_WPTight_Gsf'#allowMissingBranch=1 + ], + } + + # nu nu + self._nunu_hlt = { + '2016': [ + 'PFMET110_PFMHT110_IDTight', + #'PFMET110_PFMHT120_IDTight', # found in hltbranches_2016.txt but not in AN, maybe redundant? + 'PFMET170_NoiseCleaned',#allowMissingBranch=1 + 'PFMET170_BeamHaloCleaned',#allowMissingBranch=1 + 'PFMET170_HBHECleaned' + ], + '2017': [ + 'PFMET110_PFMHT110_IDTight', + 'PFMET120_PFMHT120_IDTight', + 'PFMET120_PFMHT120_IDTight_PFHT60',#allowMissingBranch=1 + 'PFMETTypeOne120_PFMHT120_IDTight' + ], + '2018': [ + 'PFMET110_PFMHT110_IDTight', + 'PFMET120_PFMHT120_IDTight', + 'PFMET120_PFMHT120_IDTight_PFHT60'#allowMissingBranch=1 + ], + } + + ''' + + # differences between UL and EOY + # see https://twiki.cern.ch/twiki/bin/view/CMS/MissingETOptionalFiltersRun2 + # also look at sec. 3.7.2 + self._met_filters = { + '2016': { + 'data': [ + 'goodVertices', + 'globalSuperTightHalo2016Filter', + 'HBHENoiseFilter', + 'HBHENoiseIsoFilter', + 'EcalDeadCellTriggerPrimitiveFilter', + 'BadPFMuonFilter', + #'BadPFMuonDzFilter', # not in EOY + 'eeBadScFilter', + ], + 'mc': [ + 'goodVertices', + 'globalSuperTightHalo2016Filter', + 'HBHENoiseFilter', + 'HBHENoiseIsoFilter', + 'EcalDeadCellTriggerPrimitiveFilter', + 'BadPFMuonFilter', + #'BadPFMuonDzFilter', # not in EOY + #'eeBadScFilter', # not suggested in EOY MC + ], + }, + '2017': { + "data": [ + "goodVertices", + "globalSuperTightHalo2016Filter", + "HBHENoiseFilter", + "HBHENoiseIsoFilter", + "EcalDeadCellTriggerPrimitiveFilter", + "BadPFMuonFilter", + "BadPFMuonDzFilter", + "hfNoisyHitsFilter", + "eeBadScFilter", + "ecalBadCalibFilter", + ], + "mc": [ + "goodVertices", + "globalSuperTightHalo2016Filter", + "HBHENoiseFilter", + "HBHENoiseIsoFilter", + "EcalDeadCellTriggerPrimitiveFilter", + "BadPFMuonFilter", + "BadPFMuonDzFilter", + "hfNoisyHitsFilter", + "eeBadScFilter", + "ecalBadCalibFilter", + ], + }, + '2018': { + 'data': [ + 'goodVertices', + 'globalSuperTightHalo2016Filter', + 'HBHENoiseFilter', + 'HBHENoiseIsoFilter', + 'EcalDeadCellTriggerPrimitiveFilter', + 'BadPFMuonFilter', + #'BadPFMuonDzFilter', # not in EOY + #'hfNoisyHitsFilter', # not in EOY + 'eeBadScFilter', + 'ecalBadCalibFilterV2', + ], + 'mc': [ + 'goodVertices', + 'globalSuperTightHalo2016Filter', + 'HBHENoiseFilter', + 'HBHENoiseIsoFilter', + 'EcalDeadCellTriggerPrimitiveFilter', + 'BadPFMuonFilter', + #'BadPFMuonDzFilter', # not in EOY + #'hfNoisyHitsFilter', # not in EOY + #'eeBadScFilter', # not suggested in EOY MC + 'ecalBadCalibFilterV2', + ], + }, + } + + # https://gitlab.cern.ch/aachen-3a/vhcc-nano/-/blob/master/crab/crab_all.py#L33-36 + #'https://cms-service-dqmdc.web.cern.ch/CAF/certification/Collisions16/13TeV/ReReco/Final/Cert_271036-284044_13TeV_23Sep2016ReReco_Collisions16_JSON.txt' + 
#'https://cms-service-dqmdc.web.cern.ch/CAF/certification/Collisions17/13TeV/ReReco/Cert_294927-306462_13TeV_EOY2017ReReco_Collisions17_JSON.txt' + #'https://cms-service-dqmdc.web.cern.ch/CAF/certification/Collisions18/13TeV/ReReco/Cert_314472-325175_13TeV_17SeptEarlyReReco2018ABC_PromptEraD_Collisions18_JSON.txt' + # downloaded. + ''' + self._lumiMasks = { + '2016': LumiMask('src/VHcc/data/Lumimask/Cert_271036-284044_13TeV_23Sep2016ReReco_Collisions16_JSON.txt'), + '2017': LumiMask('src/VHcc/data/Lumimask/Cert_294927-306462_13TeV_EOY2017ReReco_Collisions17_JSON.txt'), + '2018': LumiMask('src/VHcc/data/Lumimask/Cert_314472-325175_13TeV_17SeptEarlyReReco2018ABC_PromptEraD_Collisions18_JSON.txt') + } + ''' + self._lumiMasks = { + '2016': LumiMask('src/VHcc/data/Lumimask/Cert_271036-284044_13TeV_23Sep2016ReReco_Collisions16_JSON.txt'), + '2017': LumiMask('src/VHcc/data/Lumimask/Cert_294927-306462_13TeV_UL2017_Collisions17_GoldenJSON.txt'), + '2018': LumiMask('src/VHcc/data/Lumimask/Cert_314472-325175_13TeV_17SeptEarlyReReco2018ABC_PromptEraD_Collisions18_JSON.txt') + } + + self._corr = init_corr(self._year) + + # Axes: Cat - what it is, a type of something, described with words + # Bin - how much of something, numerical things + # + # --> Some axes are already connected to specific objetcs, or to the event + # --> Others are "building-blocks" that can be reused multiple times + + list_of_datasets = dataset_categories(self._year) + #print(list_of_datasets) + #sys.exit() + # Define axes + # Should read axes from NanoAOD config / metadata + #dataset_axis = hist.Cat("dataset", "Primary dataset") + dataset_axis = hist.axis.StrCategory([], name="dataset", label="Primary dataset", growth=True) + # split V+jets sample & VZ signal, this is per event + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L2184-L2276 + #datasetSplit_axis = hist.Cat("datasetSplit", "Dataset split by flav", list_of_datasets) + datasetSplit_axis = hist.axis.StrCategory(list_of_datasets, name="datasetSplit", label="Dataset split by flav") + + # use hadronFlavour, necessary when applying btag scale factors (that depend on flavour) + # this one will be done per jet, can have values 0, 4, 5 + #flav_axis = hist.Bin("flav", r"hadronFlavour",[0,1,4,5,6]) + flav_axis = hist.axis.Variable([0,1,4,5,6], name="flav", label="hadronFlavour") + + #lepflav_axis = hist.Cat("lepflav",['ee','mumu']) + lepflav_axis = hist.axis.StrCategory(['ee','mumu'], name="lepflav", label="Lepton flav") + + regions = ['SR_2LL','SR_2LH', + 'CR_Zcc_2LL','CR_Zcc_2LH', + 'CR_Z_LF_2LL','CR_Z_LF_2LH', + 'CR_Z_HF_2LL','CR_Z_HF_2LH', + 'CR_t_tbar_2LL','CR_t_tbar_2LH'] + #region_axis = hist.Cat("region",regions) + region_axis = hist.axis.StrCategory(regions, name="region", label="Region") + + # Events + njet_axis = hist.axis.Regular(13, -.5, 12.5, name="nj", label="N jets") #hist.Bin("nj", r"N jets", 13, -.5, 12.5) + + nAddJets_axis = hist.axis.Regular(11, -.5, 10.5, name="nAddJets302p5_puid", label="N additional jets") + #hist.Bin("nAddJets302p5_puid", r"N additional jets", 11, -.5, 10.5) + nAddJets_FSRsub_axis = hist.axis.Regular(11, -.5, 10.5, name="nAddJetsFSRsub302p5_puid", label="N additional jets (FSR subtracted)") + #hist.Bin("nAddJetsFSRsub302p5_puid", r"N additional jets (FSR subtracted)", 11, -.5, 10.5) + + #nbjet_axis = hist.Bin("nbj", r"N b jets", [0,1,2,3,4,5]) + #ncjet_axis = hist.Bin("ncj", r"N c jets", [0,1,2,3,4,5]) + # kinematic variables + pt_axis = hist.axis.Regular(50, 0, 300, name="pt", label=r"$p_{T}$ 
[GeV]") + #hist.Bin("pt", r" $p_{T}$ [GeV]", 50, 0, 300) + eta_axis = hist.axis.Regular(25, -2.5, 2.5, name="eta", label=r"$\eta$") + #hist.Bin("eta", r" $\eta$", 25, -2.5, 2.5) + phi_axis = hist.axis.Regular(30, -3, 3, name="phi", label=r"$\phi$") + #hist.Bin("phi", r" $\phi$", 30, -3, 3) + mass_axis = hist.axis.Regular(50, 0, 300, name="mass", label=r"$m$ [GeV]") + #hist.Bin("mass", r" $m$ [GeV]", 50, 0, 300) + mt_axis = hist.axis.Regular(30, 0, 300, name="mt", label=r"$m_{T}$ [GeV]") + #hist.Bin("mt", r" $m_{T}$ [GeV]", 30, 0, 300) + dr_axis = hist.axis.Regular(20, 0, 5, name="dr", label=r"$\Delta$R") + #hist.Bin("dr","$\Delta$R",20,0,5) + + # some more variables to check, which enter BDT + # need to revisit this later, because high Vpt and low Vpt can have different binning + jjVPtRatio_axis = hist.axis.Regular(15, 0, 2, name="jjVPtRatio", label=r"$p_{T}(jj) / $p_{T}(V)$ [GeV]") + #hist.Bin("jjVPtRatio",r"$p_{T}(jj) / $p_{T}(V)$ [GeV]",15,0,2) + + + #dphi_V_H_axis = hist.Bin("dphi_V_H","$\Delta\Phi(V, H)$",20,0,3.2) + # jet jet + #dr_j1_j2_axis = hist.Bin("dr_j1_j2","$\Delta R(j1,j2)$",20,0,5) + # jet jet + #dphi_j1_j2_axis = hist.Bin("dphi_j1_j2","$\Delta\Phi(j1,j2)$",15,-3.2,3.2) + #deta_j1_j2_axis = hist.Bin("deta_j1_j2","$\Delta\eta(j1,j2)$",15,0,3) + # lepton lepton + #dphi_l1_l2_axis = hist.Bin("dphi_l1_l2","$\Delta\Phi(l1,l2)$",15,0,3.2) + #deta_l1_l2_axis = hist.Bin("eta_l1_l2","$\Delta\eta(l1,l2)$",15,0,2.6) + # jet lepton + #dphi_j1_l1_axis = hist.Bin("dphi_j1_l1","$\Delta\Phi(j1,l1)$",15,0,3.2) + #dphi_j2_l1_axis = hist.Bin("dphi_j2_l1","$\Delta\Phi(j2,l1)$",15,0,3.2) + #dphi_j1_l2_axis = hist.Bin("dphi_j1_l2","$\Delta\Phi(j1,l2)$",15,0,3.2) + #dphi_j2_l2_axis = hist.Bin("dphi_j2_l2","$\Delta\Phi(j2,l2)$",15,0,3.2) + + # ToDo: several other variables can only be stored after kinfit + # e.g. 
here https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/python/kinfitter.py + # or https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L4670-L4995 + + # weights are interesting as well + weight_axis = hist.axis.Regular(100, -5.001, 5.001, name="weight_full", label="weight_full") + #hist.Bin("weight_full","weight_full",100,-5.001, 5.001) + genweight_axis = hist.axis.Regular(100, -0.001, 0.001, name="genWeight", label="genWeight") + #hist.Bin("genWeight","genWeight",100,-0.001, 0.001) + sign_genweight_axis = hist.axis.Regular(100, -1.001, 1.001, name="genWeight_by_abs", label="genWeight/abs(genWeight)") + #hist.Bin("genWeight_by_abs","genWeight/abs(genWeight)",100,-1.001,1.001) + + + # MET vars + #signi_axis = hist.Bin("significance", r"MET $\sigma$",20,0,10) + #covXX_axis = hist.Bin("covXX",r"MET covXX",20,0,10) + #covXY_axis = hist.Bin("covXY",r"MET covXY",20,0,10) + #covYY_axis = hist.Bin("covYY",r"MET covYY",20,0,10) + #sumEt_axis = hist.Bin("sumEt", r" MET sumEt", 50, 0, 300) + + # ToDo: switch to this + # axis.StrCategory([], name='region', growth=True), + #disc_list = [ 'btagDeepCvL', 'btagDeepCvB','btagDeepFlavCvB','btagDeepFlavCvL']#,'particleNetAK4_CvL','particleNetAK4_CvB'] + # As far as I can tell, we only need DeepFlav currently + ### In all of the older stuff use: + #disc_list = ['btagDeepFlavC','btagDeepFlavB','btagDeepFlavCvL','btagDeepFlavCvB'] + ### With new stuff UL I use, use: + disc_list = ['btagDeepFlavCvL','btagDeepFlavCvB'] + btag_axes = [] + for d in disc_list: + # technically, -1 values are possible, but probably unlikely to matter much after event selection + btag_axes.append(hist.axis.Regular(20, 0, 1, name=d, label=d) + #hist.Bin(d, d , 20, 0, 1) + ) + #h = ( + # Hist.new.Reg(10, -5, 5, overflow=False, underflow=False, name="A") + # .Bool(name="B") + # .Var(range(10), name="C") + # .Int(-5, 5, overflow=False, underflow=False, name="D") + # .IntCat(range(10), name="E") + # .StrCat(["T", "F"], name="F") + # .Double() + #) + #print(type(dataset_axis)) + #print(type(lepflav_axis)) + #print(type(flav_axis)) + #print(type(njet_axis)) + #print(type(hist.storage.Weight())) + #testHistA = Hist(dataset_axis, datasetSplit_axis, lepflav_axis, region_axis, njet_axis, hist.storage.Weight()) + #testHist = Hist( + # #dataset_axis, + # #datasetSplit_axis, + # #lepflav_axis, + # #region_axis, + # njet_axis, + # hist.storage.Weight() + # ) + _hist_event_dict = { + 'nj' : Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + njet_axis, hist.storage.Weight()), + 'nAddJets302p5_puid' : Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + nAddJets_axis, hist.storage.Weight()), + 'nAddJetsFSRsub302p5_puid' : Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + nAddJets_FSRsub_axis, hist.storage.Weight()), + # 'weight_full' : Hist(datasetSplit_axis, + # lepflav_axis, + # region_axis, + # weight_axis, hist.storage.Weight()), + # 'genweight' : Hist(datasetSplit_axis, + # lepflav_axis, + # region_axis, + # genweight_axis, hist.storage.Weight()), + # 'sign_genweight' : Hist(datasetSplit_axis, + # lepflav_axis, + # region_axis, + # sign_genweight_axis, hist.storage.Weight()), + 'jjVPtRatio' : Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + jjVPtRatio_axis, hist.storage.Weight()) + + #'dphi_V_H' : Hist( dataset_axis, datasetSplit_axis, lepflav_axis, region_axis, ,dphi_V_H_axis) + #'dr_j1_j2' : Hist( dataset_axis, datasetSplit_axis, lepflav_axis, region_axis, ,dr_j1_j2_axis) + #'dphi_j1_j2' : Hist( dataset_axis, datasetSplit_axis, 
lepflav_axis, region_axis, dphi_j1_j2_axis) + #'deta_j1_j2' : Hist( dataset_axis, datasetSplit_axis, lepflav_axis, region_axis, deta_j1_j2_axis) + #'dphi_l1_l2' : Hist( dataset_axis, datasetSplit_axis, lepflav_axis, region_axis, dphi_l1_l2_axis) + #'dphi_j1_l2' : Hist( dataset_axis, datasetSplit_axis, lepflav_axis, region_axis, dphi_j1_l2_axis) + #'dphi_j2_l2' : Hist( dataset_axis, datasetSplit_axis, lepflav_axis, region_axis, dphi_j2_l2_axis) + + #'sampleFlavSplit' : Hist( dataset_axis, lepflav_axis, region_axis, sampleFlavSplit_axis), + #'nbj' : Hist( dataset_axis, lepflav_axis, region_axis, nbjet_axis), + #'ncj' : Hist( dataset_axis, lepflav_axis, region_axis, ncjet_axis), + #'hj_dr' : Hist( dataset_axis, lepflav_axis, region_axis, dr_axis), + #'MET_sumEt' : Hist( dataset_axis, lepflav_axis, region_axis, sumEt_axis), + #'MET_significance' : Hist( dataset_axis, lepflav_axis, region_axis, signi_axis), + #'MET_covXX' : Hist( dataset_axis, lepflav_axis, region_axis, covXX_axis), + #'MET_covXY' : Hist( dataset_axis, lepflav_axis, region_axis, covXY_axis), + #'MET_covYY' : Hist( dataset_axis, lepflav_axis, region_axis, covYY_axis), + #'MET_phi' : Hist( dataset_axis, lepflav_axis, region_axis, phi_axis), + #'MET_pt' : Hist( dataset_axis, lepflav_axis, region_axis, pt_axis), + #'mT1' : Hist( dataset_axis, lepflav_axis, region_axis, mt_axis), + #'mT2' : Hist( dataset_axis, lepflav_axis, region_axis, mt_axis), + #'mTh':Hist( dataset_axis, lepflav_axis, region_axis, mt_axis), + #'dphi_lep1':Hist( dataset_axis, lepflav_axis, region_axis, phi_axis), + #'dphi_lep2':Hist( dataset_axis, lepflav_axis, region_axis, phi_axis), + #'dphi_ll':Hist( dataset_axis, lepflav_axis, region_axis, phi_axis), + } + + # jets will be ordered by DeepJet (which is DeepFlav for historical reasons) + objects=['leading_jetflav','subleading_jetflav','lep1','lep2','ll','jj'] + + for i in objects: + # distinguish between jets and other objects, as the structure for jets contains additional flavour axis + if 'jet' in i: + _hist_event_dict["%s_pt" %(i)]=Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + flav_axis, + pt_axis, hist.storage.Weight()) + _hist_event_dict["%s_eta" %(i)]=Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + flav_axis, + eta_axis, hist.storage.Weight()) + _hist_event_dict["%s_phi" %(i)]=Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + flav_axis, + phi_axis, hist.storage.Weight()) + _hist_event_dict["%s_mass" %(i)]=Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + flav_axis, + mass_axis, hist.storage.Weight()) + else: + _hist_event_dict["%s_pt" %(i)]=Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + pt_axis, hist.storage.Weight()) + _hist_event_dict["%s_eta" %(i)]=Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + eta_axis, hist.storage.Weight()) + _hist_event_dict["%s_phi" %(i)]=Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + phi_axis, hist.storage.Weight()) + _hist_event_dict["%s_mass" %(i)]=Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + mass_axis, hist.storage.Weight()) + + # more information on the discriminators is stored for the first two jets, + # ordered by DeepJet CvL discriminator and called "leading" and "subleading" + for disc, axis in zip(disc_list,btag_axes): + _hist_event_dict["leading_jetflav_%s" %(disc)] = Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + flav_axis, + axis, hist.storage.Weight()) + _hist_event_dict["subleading_jetflav_%s" %(disc)] = Hist(datasetSplit_axis, + lepflav_axis, + region_axis, + flav_axis, + axis, 
hist.storage.Weight())
+
+        self.event_hists = list(_hist_event_dict.keys())
+
+        # this can be used to not only store histograms, but also features on a per-event basis (arrays)
+        if self._export_array:
+            _hist_event_dict['array'] = processor.defaultdict_accumulator(array_accumulator)
+        #self._accumulator = processor.dict_accumulator(
+        #    {**_hist_event_dict,
+        #     #'cutflow': processor.defaultdict_accumulator(
+        #     #    partial(processor.defaultdict_accumulator, int))
+        #    })
+        #self._accumulator['sumw'] = processor.defaultdict_accumulator(float)
+
+        #self._accumulator = processor.dict_accumulator(
+        #    {
+        #        observable: Hist.Hist(var_axis, name="Counts", storage="Weight")
+        #        for observable, var_axis in axis.items()
+        #        if observable != "dataset"
+        #    }
+        #)
+        #self._accumulator["cutflow"] = processor.defaultdict_accumulator(
+        #    partial(processor.defaultdict_accumulator, int)
+        #)
+        #self._accumulator["sumw"] = 0
+
+        self.make_output = lambda: {
+            "cutflow": processor.defaultdict_accumulator(
+                partial(processor.defaultdict_accumulator, int)
+            ),
+            "sumw": 0,
+            **_hist_event_dict
+        }
+
+
+    @property
+    def accumulator(self):
+        # self._accumulator is never assigned anymore (all assignments above are
+        # commented out), so return a fresh output template instead of a stale attribute
+        return self.make_output()
+
+    def process(self, events):
+        #output = self.accumulator #.identity()
+        output = self.make_output()
+        dataset = events.metadata['dataset']
+        start = events.metadata['entrystart']
+        stop = events.metadata['entrystop']
+        output_location_list = []
+        # note: str.strip('.root') would remove any of the characters '.', 'r', 'o', 't'
+        # from both ends of the name, not the suffix; slice the extension off instead
+        filename = events.metadata['filename'].split('/')[-1]
+        if filename.endswith('.root'):
+            filename = filename[:-len('.root')]
+        #print(dataset)
+        # Q: could there be MC that does not have this attribute? Or is it always the case?
+        isRealData = not hasattr(events, "genWeight")
+
+        # Done (externally): map from the lengthy dataset (path) to a more readable name
+        # Keep the long name only for data, because it contains the Run info (necessary to apply corrections)
+        if isRealData:
+            info_dict = get_info_dict(self._year)
+            dataset_long = dataset
+            dictname = dataset[1:].split('/')[0]
+            dataset = info_dict[dictname]
+            print(dataset)
+        sample_type, doFlavSplit = dataset_name_to_number(dataset, self._year)
+        # the length of events is used many times later on, so save it once and refer to that
+        nEvents = len(events)
+        print('Number of events: ', nEvents)
+        if 'ZH' in dataset:
+            ttyp = 'signal_04_mid'
+        else:
+            ttyp = 'back_04_mid'
+        folder_save = f'condor_{ttyp}'
+        # makedirs creates the full path in one call; exist_ok=True avoids a race
+        # when several workers create the same directories in parallel
+        os.makedirs(f"./{folder_save}/{dataset}/{filename}", exist_ok=True)
+        with open(f"./{folder_save}/event_nr.txt", "a") as myfile:
+            myfile.write(f"Nr of events in {filename} from {start} to {stop}: " + str(nEvents) + " " + '\n')
+
+        # As far as I understand, this is a neat way to give selections a name,
+        # while internally there are boolean arrays for all events
+        selection = PackedSelection()
+
+
+        # this is either counting events in data with weight 1, or weighted (MC)
+        if isRealData:
+            output['sumw'] += nEvents
+        else:
+            # instead of taking the weights themselves, the sign is used:
+            # https://cms-talk.web.cern.ch/t/huge-event-weights-in-dy-powhegminnlo/8718/7
+            # although I initially had the same concerns as those raised in the thread,
+            # if not only the sign is different, but also the absolute values between events,
+            # somehow it seems to average out, although I don't see why this is guaranteed;
+            # must have to do with "LO without interference" where
the values are indeed same + # and if they are not same, the differences are consired to be negligible + output['sumw'] += ak.sum(events.genWeight/abs(events.genWeight)) + + + req_lumi=np.ones(nEvents, dtype='bool') + if isRealData: + req_lumi=self._lumiMasks[self._year](events.run, events.luminosityBlock) + selection.add('lumi',ak.to_numpy(req_lumi)) + del req_lumi + + + # AS: sort of the same thing as above, but now per entry + weights = Weights(nEvents, storeIndividual=True) + if isRealData: + weights.add('genweight',np.ones(nEvents)) + else: + weights.add('genweight',events.genWeight/abs(events.genWeight)) + # weights.add('puweight', compiled['2017_pileupweight'](events.Pileup.nPU)) + + + ############## + if isRealData: + output['cutflow'][dataset]['all'] += nEvents + output['cutflow'][dataset]['all (weight 1)'] += nEvents + else: + output['cutflow'][dataset]['all'] += ak.sum(events.genWeight/abs(events.genWeight)) + output['cutflow'][dataset]['all (weight 1)'] += nEvents + + + #trigger_met = np.zeros(nEvents, dtype='bool') + + trigger_ee = np.zeros(nEvents, dtype='bool') + trigger_mm = np.zeros(nEvents, dtype='bool') + + #trigger_e = np.zeros(nEvents, dtype='bool') + #trigger_m = np.zeros(nEvents, dtype='bool') + + #for t in self._nunu_hlt[self._year]: + # # so that already seems to be the check for whether the path exists in the file or not + # if t in events.HLT.fields: + # trigger_met = trigger_met | events.HLT[t] + + for t in self._mumu_hlt[self._year]: + if t in events.HLT.fields: + trigger_mm = trigger_mm | events.HLT[t] + + for t in self._ee_hlt[self._year]: + if t in events.HLT.fields: + trigger_ee = trigger_ee | events.HLT[t] + + #for t in self._munu_hlt[self._year]: + # if t in events.HLT.fields: + # trigger_m = trigger_m | events.HLT[t] + + #for t in self._emu_hlt[self._year]: + # if t in events.HLT.fields: + # trigger_e = trigger_e | events.HLT[t] + + + selection.add('trigger_ee', ak.to_numpy(trigger_ee)) + selection.add('trigger_mumu', ak.to_numpy(trigger_mm)) + + + # apart from the comments above about EOY/UL, should be fine + metfilter = np.ones(nEvents, dtype='bool') + for flag in self._met_filters[self._year]['data' if isRealData else 'mc']: + metfilter &= np.array(events.Flag[flag]) + selection.add('metfilter', metfilter) + del metfilter + + + + # Not strictly necessary for Zll + met = ak.zip({ + "pt": events.MET.pt, + "phi": events.MET.phi, + "energy": events.MET.sumEt, + }, with_name="PtEtaPhiMLorentzVector" + ) + + + + split_by_flav = False + sampleFlavSplit = np.zeros(nEvents) + possible_flavSplits = ['already_split_sample'] + selection.add('already_split_sample',sampleFlavSplit == 0) + if not isRealData and not self._debug: + if doFlavSplit == '1' and not (int(sample_type) >= 27 and int(sample_type) <= 39): + split_by_flav = True + # uses the same naming scheme as AT, although udbsg is counterintuitive (b? 
[sic!]) + possible_flavSplits = ['_cc','_bb','_bc','_cl','_bl','_udbsg'] + # ================================================================================= + # + # # Split V+jets BG by flavour, via GenJet + # + # --------------------------------------------------------------------------------- + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L2184-L2228 + gen_jet = events.GenJet + + cGenJetTot = ak.sum((gen_jet.hadronFlavour == 4) & (gen_jet.pt > 20) & (abs(gen_jet.eta) < 2.4), axis=1) + bGenJetTot = ak.sum((gen_jet.hadronFlavour == 5) & (gen_jet.pt > 20) & (abs(gen_jet.eta) < 2.4), axis=1) + + tag_cc = cGenJetTot >= 2 + tag_bb = bGenJetTot >= 2 + tag_bc = (bGenJetTot == 1) & (cGenJetTot == 1) + tag_cl = (cGenJetTot == 1) & (bGenJetTot == 0) + tag_bl = (bGenJetTot == 1) & (cGenJetTot == 0) + tag_ll = (cGenJetTot == 0) & (bGenJetTot == 0) + + sampleFlavSplit = 1 * tag_cc + 2 * tag_bb + 3 * tag_bc + 4 * tag_cl + 5 * tag_bl + 6 * tag_ll + selection.add('_cc',sampleFlavSplit == 1) + selection.add('_bb',sampleFlavSplit == 2) + selection.add('_bc',sampleFlavSplit == 3) + selection.add('_cl',sampleFlavSplit == 4) + selection.add('_bl',sampleFlavSplit == 5) + selection.add('_udbsg',sampleFlavSplit == 6) # tbf I don't know why it contains b + + #elif dataset in ['WZTo1L1Nu2Q', 'ZZTo2L2Q', 'ZZTo2Q2Nu']: # VZ signal datasets + elif int(sample_type) in [32,36,37]: # VZ signal datasets + split_by_flav = True + possible_flavSplits = ['cc','bb','ll'] + # ================================================================================= + # + # # Split VZ signal by flavour, via GenPart + # + # --------------------------------------------------------------------------------- + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L2229-L2264 + gen_part = events.GenPart + + + Z_decay_mothers_A = (abs(gen_part.pdgId) == 23) & (gen_part.hasFlags('isLastCopy')) + + Z_decays = gen_part[Z_decay_mothers_A] + output['cutflow'][dataset]['GenPart VZ signal'] += ak.sum(Z_decay_mothers_A) + + n_b_from_Z = ak.sum(ak.sum(abs(Z_decays.children.pdgId) == 5, axis=-1), axis=-1) + n_c_from_Z = ak.sum(ak.sum(abs(Z_decays.children.pdgId) == 4, axis=-1), axis=-1) + + + + VZ_cc = (n_c_from_Z >= 2) + VZ_bb = (n_b_from_Z >= 2) + VZ_others = (~VZ_cc) & (~VZ_bb) + # 1, 2 and 3 identical to what was done in AnalysisTools! Do not confuse with BTV / hadron / parton flavour... 
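+            # The weighted boolean sum below encodes the categories as integers: in
+            # these samples at most one Z decays hadronically, so at most one of
+            # VZ_cc / VZ_bb can fire and each event gets exactly one code, e.g.
+            #   VZ_cc=True, VZ_bb=False, VZ_others=False  ->  1*1 + 2*0 + 3*0 = 1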
+ sampleFlavSplit = 1 * VZ_cc + 2 * VZ_bb + 3 * VZ_others + + #print(sampleFlavSplit.type) + + selection.add('cc',sampleFlavSplit == 1) + selection.add('bb',sampleFlavSplit == 2) + selection.add('ll',sampleFlavSplit == 3) + + elif int(sample_type) in [27,28,29,30,31,33,34,35,38,39]: + possible_flavSplits = ['ll'] + sampleFlavSplit = sampleFlavSplit + 3 + selection.add('ll',sampleFlavSplit == 3) + split_by_flav = True + + # this is how it looked in AT for comparison: + ''' + else if( cursample->doJetFlavorSplit + && ( mInt("sampleIndex")==27 || mInt("sampleIndex")==28 + || mInt("sampleIndex")==29 || mInt("sampleIndex")==30 + || mInt("sampleIndex")==31 || mInt("sampleIndex")==33 + || mInt("sampleIndex")==34 || mInt("sampleIndex")==35 + || mInt("sampleIndex")==38 || mInt("sampleIndex")==39 + ) + ){ + *in["sampleIndex"] = mInt("sampleIndex")*100 + 3; + ''' + + + + + + # ================================================================================= + # + # # Reconstruct and preselect leptons + # + # --------------------------------------------------------------------------------- + + + # Adopt from https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L3369-L3440 + # https://gitlab.cern.ch/aachen-3a/vhcc-nano/-/blob/master/VHccProducer.py#L345-389 + + # ## Muon cuts + ## muon twiki: https://twiki.cern.ch/twiki/bin/view/CMS/SWGuideMuonIdRun2 + #event_mu = events.Muon[ak.argsort(events.Muon.pt, axis=1, ascending=False)] + event_mu = events.Muon + # looseId >= 1 or looseId seems to be the same... + musel = ((event_mu.pt > 20) & (abs(event_mu.eta) < 2.4) & (event_mu.looseId >= 1) & (event_mu.pfRelIso04_all<0.25)) #(event_mu.looseId >= 1) (event_mu.mvaId >= 3) + # but 25GeV and 0.06 for 1L, xy 0.05 z 0.2, &(abs(event_mu.dxy)<0.06)&(abs(event_mu.dz)<0.2) and tightId for 1L + event_mu = event_mu[musel] + event_mu = event_mu[ak.argsort(event_mu.pt, axis=1, ascending=False)] + event_mu["lep_flav"] = 13*event_mu.charge + event_mu= ak.pad_none(event_mu,2,axis=1) + nmu = ak.sum(musel,axis=1) + # ToDo: PtCorrGeoFit + + # ## Electron cuts + ## # electron twiki: https://twiki.cern.ch/twiki/bin/viewauth/CMS/CutBasedElectronIdentificationRun2 + #event_e = events.Electron[ak.argsort(events.Electron.pt, axis=1,ascending=False)] + event_e = events.Electron + elesel = ((event_e.pt > 20) & (abs(event_e.eta) < 2.5) & (event_e.mvaFall17V2Iso_WP90==1) & (event_e.pfRelIso03_all<0.25)) + # but 30GeV and WP80 for 1L + event_e = event_e[elesel] + # something I saw in a recent presentation, and also in AT code: + # https://indico.desy.de/event/34473/contributions/122201/attachments/76587/98753/RTG_Meeting_01_09_22.pdf + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/VHccAnalysis/PlotWithVarial/ZllHccLowPt.py#L256-L257 + # is to require "good electrons", which means excluding some region (eta), + # I guess it has sth to do with transition between barrel / endcap? 
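+        # (The window 1.4442 < |eta| < 1.5660 excluded below is the ECAL barrel-endcap
+        # transition ("crack") region, where electron reconstruction is degraded.)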
+ event_e = event_e[(abs(event_e.eta) > 1.5660) | (abs(event_e.eta) < 1.4442)] + event_e = event_e[ak.argsort(event_e.pt, axis=1,ascending=False)] + event_e["lep_flav"] = 11*event_e.charge + event_e = ak.pad_none(event_e,2,axis=1) + nele = ak.sum(elesel,axis=1) + # sorting after selecting should be faster (less computations on average) + + # for this channel (Zll / 2L) + selection.add('lepsel',ak.to_numpy((nele==2)|(nmu==2))) + + print(event_e) + print("Elecs") + print(event_mu) + print("Mus") + print(ak.concatenate([ event_e, event_mu], axis=1)) + print("Concat") + #### build lepton pair(s) + good_leptons = ak.with_name( + ak.concatenate([ event_e, event_mu], axis=1), + "PtEtaPhiMCandidate", ) + good_leptons = good_leptons[ak.argsort(good_leptons.pt, axis=1,ascending=False)] + leppair = ak.combinations( + good_leptons, + n=2, + replacement=False, + axis=-1, + fields=["lep1", "lep2"], + ) + #charged_constr = ((leppair.lep1['lep_flav'] + leppair.lep2['lep_flav']) == 0 ) + #leppair = leppair[charged_constr] + ll_cand = ak.zip({ + "lep1" : leppair.lep1, + "lep2" : leppair.lep2, + "pt": (leppair.lep1+leppair.lep2).pt, + "eta": (leppair.lep1+leppair.lep2).eta, + "phi": (leppair.lep1+leppair.lep2).phi, + "mass": (leppair.lep1+leppair.lep2).mass, + }, with_name="PtEtaPhiMLorentzVector" + ) + # probably there needs to be a cross-check that we don't include more than we want here, + # I know there is the option to truncate the array if more than 1 is found + # --> clip = True + ll_cand = ak.pad_none(ll_cand,1,axis=1) + + print(ll_cand) + + # there seem to be multiple ways to get the "one" ll_cand of interest + # - closest to Z-mass [makes sense] + # I think others use this + # - lepton-pair with highest pt [also, maybe it's even the same in the majority of the cases] + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L3369-L3440 + ZMASS = particle.Particle.findall("Z0")[0].mass / GeV + + if (ak.count(ll_cand.pt)>0): + ll_cand = ll_cand[ak.argsort(ll_cand.pt, axis=1,ascending=False)] + #if (ak.count(ll_cand.pt)>0): + # ll_cand = ll_cand[ak.argsort(np.abs(ll_cand.mass-ZMASS), axis=1,ascending=True)] + # try the second option here + # NOTE: Comment out to debug stuff + ll_cand = ll_cand[:, 0] + + print(ll_cand) + print() + + # ================================================================================= + # + # # Reconstruct and preselect leptons gen level + # + # --------------------------------------------------------------------------------- + + # ## Muon cuts + generator = events.GenPart + #event_mu = events.Muon[ak.argsort(events.Muon.pt, axis=1, ascending=False)] + + event_mu_gen = generator[np.abs(generator.pdgId) == 13] + # looseId >= 1 or looseId seems to be the same... 
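+        # (GenPart.status == 1 selects stable final-state particles; no ID or
+        # isolation requirement exists at gen level, hence only pt/eta/status cuts.)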
+ musel_gen = ((event_mu_gen.pt > 20) & (abs(event_mu_gen.eta) < 2.4) & (event_mu_gen.status == 1)) + event_mu_gen = event_mu_gen[musel_gen] + event_mu_gen = event_mu_gen[ak.argsort(event_mu_gen.pt, axis=1, ascending=False)] + event_mu_gen["lep_flav"] = event_mu_gen.pdgId + event_mu_gen["charge"] = event_mu_gen.pdgId/13 + event_mu_gen= ak.pad_none(event_mu_gen,2,axis=1) + nmu_gen = ak.sum(musel_gen,axis=1) + # ToDo: PtCorrGeoFit + + # ## Electron cuts + ## # electron twiki: https://twiki.cern.ch/twiki/bin/viewauth/CMS/CutBasedElectronIdentificationRun2 + #event_e = events.Electron[ak.argsort(events.Electron.pt, axis=1,ascending=False)] + event_e_gen = generator[np.abs(generator.pdgId) == 11] + elesel_gen = ((event_e_gen.pt > 20) & (abs(event_e_gen.eta) < 2.4) & (event_e_gen.status == 1)) + # but 30GeV and WP80 for 1L + event_e_gen = event_e_gen[elesel_gen] + event_e_gen = event_e_gen[(abs(event_e_gen.eta) > 1.5660) | (abs(event_e_gen.eta) < 1.4442)] + event_e_gen = event_e_gen[ak.argsort(event_e_gen.pt, axis=1,ascending=False)] + event_e_gen["lep_flav"] = event_e_gen.pdgId + event_e_gen["charge"] = event_e_gen.pdgId/11 + event_e_gen = ak.pad_none(event_e_gen,2,axis=1) + nele_gen = ak.sum(elesel_gen,axis=1) + # sorting after selecting should be faster (less computations on average) + + # for this channel (Zll / 2L) + selection.add('lepsel_gen',ak.to_numpy((nele_gen==2)|(nmu_gen==2))) + + print(event_e_gen) + print("Elecs") + print(event_mu_gen) + print("Mus") + print(ak.concatenate([ event_e_gen, event_mu_gen], axis=1)) + print("Concat") + #### build lepton pair(s) + good_leptons_gen = ak.with_name( + ak.concatenate([ event_e_gen, event_mu_gen], axis=1), + "PtEtaPhiMCandidate", ) + good_leptons_gen = good_leptons_gen[ak.argsort(good_leptons_gen.pt, axis=1,ascending=False)] + leppair_gen = ak.combinations( + good_leptons_gen, + n=2, + replacement=False, + axis=1, + fields=["lep1", "lep2"], + ) + #charged_constr = ((leppair.lep1['lep_flav'] + leppair.lep2['lep_flav']) == 0 ) + #leppair = leppair[charged_constr] + ll_cand_gen = ak.zip({ + "lep1" : leppair_gen.lep1, + "lep2" : leppair_gen.lep2, + "pt": (leppair_gen.lep1+leppair_gen.lep2).pt, + "eta": (leppair_gen.lep1+leppair_gen.lep2).eta, + "phi": (leppair_gen.lep1+leppair_gen.lep2).phi, + "mass": (leppair_gen.lep1+leppair_gen.lep2).mass, + }, with_name="PtEtaPhiMLorentzVector" + ) + # probably there needs to be a cross-check that we don't include more than we want here, + # I know there is the option to truncate the array if more than 1 is found + # --> clip = True + ll_cand_gen = ak.pad_none(ll_cand_gen,1,axis=1) + + print(ll_cand_gen) + + # there seem to be multiple ways to get the "one" ll_cand of interest + # - closest to Z-mass [makes sense] + # I think others use this + # - lepton-pair with highest pt [also, maybe it's even the same in the majority of the cases] + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L3369-L3440 + ZMASS = particle.Particle.findall("Z0")[0].mass / GeV + + if (ak.count(ll_cand_gen.pt)>0): + ll_cand_gen = ll_cand_gen[ak.argsort(ll_cand_gen.pt, axis=1,ascending=False)] + #if (ak.count(ll_cand.pt)>0): + # ll_cand = ll_cand[ak.argsort(np.abs(ll_cand.mass-ZMASS), axis=1,ascending=True)] + # try the second option here + # NOTE: Comment out to debug stuff + ll_cand_gen = ll_cand_gen[:, 0] + + print(ll_cand_gen) + print() + + # ================================================================================= + # + # # Reconstruct and preselect jets + # + # 
--------------------------------------------------------------------------------- + + # Apply correction: + if isRealData: + #print(dataset_long) + jets = jec(events,events.Jet,dataset_long,self._year,self._corr) + else: + jets = jec(events,events.Jet,dataset,self._year,self._corr) + #jets = events.Jet + + # This was necessary for the FSR code + #jets = jets.mask[ak.num(jets) > 2] + + + + # For EOY: recalculate CvL & CvB here, because the branch does not exist in older files + # adapted from PostProcessor + def deepflavcvsltag(jet): + btagDeepFlavL = 1.-(jet.btagDeepFlavC+jet.btagDeepFlavB) + return ak.where((jet.btagDeepFlavB >= 0.) & (jet.btagDeepFlavB < 1.) & (jet.btagDeepFlavC >= 0.) & (btagDeepFlavL >= 0.), + jet.btagDeepFlavC/(1.-jet.btagDeepFlavB), + (-1.) * ak.ones_like(jet.btagDeepFlavB)) + + def deepflavcvsbtag(jet): + btagDeepFlavL = 1.-(jet.btagDeepFlavC+jet.btagDeepFlavB) + return ak.where((jet.btagDeepFlavB > 0.) & (jet.btagDeepFlavC > 0.) & (btagDeepFlavL >= 0.), + jet.btagDeepFlavC/(jet.btagDeepFlavC+jet.btagDeepFlavB), + (-1.) * ak.ones_like(jet.btagDeepFlavB)) + + # Alternative ways: + # - depending on the Nano version, there might already be bTagDeepFlavCvL available + # - one could instead use DeepCSV via bTagDeepCvL + # - not necessarily use CvL, other combination possible ( CvB | pt | BDT? ) + + #jets["btagDeepFlavCvL"] = deepflavcvsltag(jets) + #jets["btagDeepFlavCvB"] = deepflavcvsbtag(jets) + jets = jets[ak.argsort(jets.btagDeepFlavCvL, axis=1, ascending=False)] + + + # Jets are considered only if the following identification conditions hold, as mentioned in AN + # - Here is some documentation related to puId and jetId: + # https://twiki.cern.ch/twiki/bin/viewauth/CMS/PileupJetID + # https://twiki.cern.ch/twiki/bin/viewauth/CMS/JetID + jet_conditions = (((abs(jets.eta) < 2.4) & (jets.pt > 20) & (jets.puId > 0)) \ + | ((jets.pt>50) & (jets.jetId>5))) & ak.all(jets.metric_table(ll_cand.lep1)>0.4, axis =2) & ak.all(jets.metric_table(ll_cand.lep2)>0.4, axis = 2) + # Count how many jets exist that pass this selection + njet = ak.sum(jet_conditions,axis=1) + selection.add('jetsel',ak.to_numpy(njet>=2)) + + + # ================================================================================= + # + # # FSR recovery + # + # --------------------------------------------------------------------------------- + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L841-L956 + + # FSR jets are selected with slightly different criteria + fsr_conditions = (abs(jets.eta) < 3) & (jets.pt > 20) \ + & ak.all(jets.metric_table(ll_cand.lep1)>0.4, axis =2) & ak.all(jets.metric_table(ll_cand.lep2)>0.4, axis = 2) + # Take the first two jets that pass the criteria and check the remaining ones, + # as well as potentially others, to get FSR jets: + pick2 = jets[ak.pad_none(ak.local_index(jets, 1)[jet_conditions], 2)[:, :2]] + others = jets[ak.concatenate([ak.pad_none(ak.local_index(jets, 1)[(jet_conditions) & (fsr_conditions)], 2)[:, 2:], + ak.local_index(jets, 1)[(~jet_conditions) & (fsr_conditions)] + ], axis=1)] + + + def find_fsr(leading, subleading, others, threshold=0.8): + mval1, (a1, b) = leading.metric_table(others, return_combinations=True) + mval2, (a2, b) = subleading.metric_table(others, return_combinations=True) + + def res(mval, out): + order = ak.argsort(mval, axis=-1) + return out[order], mval[order] + + out1, metric1 = res(mval1, b) + out2, metric2 = res(mval2, b) + + out1 = out1.mask[(metric1 <= threshold) & (metric1 < metric2)] + out2 = 
out2.mask[(metric2 <= threshold) & (metric2 < metric1)] + #out2 = out2.mask[(metric1 <= threshold) & (metric2 < metric1)] + return out1[:, 0, ...], out2[:, 0, ...] + + + missing = ~(ak.is_none(pick2[:, 0]) | ak.is_none(pick2[:, 1])) + pick2 = pick2.mask[missing] + others = others.mask[missing] + + + leading, subleading = pick2[:, 0], pick2[:, 1] + fsr_leading, fsr_subleading = find_fsr(leading, subleading, others, threshold=0.8) + + #print(leading.pt) + #print((leading + fsr_leading.sum()).pt) + + # To explicitly check that adding FSR does indeed have an effect + #print(ak.sum((leading + fsr_leading.sum()).pt != leading.pt)) + + #print(leading.type) + + # Collect the (sub-)leading jets and their respective FSR jets in a new 4-vector + leading_with_fsr = ak.zip({ + "jet1" : leading, + "jet2" : fsr_leading.sum(), + "pt": (leading + fsr_leading.sum()).pt, + "eta": (leading + fsr_leading.sum()).eta, + "phi": (leading + fsr_leading.sum()).phi, + "mass": (leading + fsr_leading.sum()).mass, + },with_name="PtEtaPhiMLorentzVector",) + + subleading_with_fsr = ak.zip({ + "jet1" : subleading, + "jet2" : fsr_subleading.sum(), + "pt": (subleading + fsr_subleading.sum()).pt, + "eta": (subleading + fsr_subleading.sum()).eta, + "phi": (subleading + fsr_subleading.sum()).phi, + "mass": (subleading + fsr_subleading.sum()).mass, + },with_name="PtEtaPhiMLorentzVector",) + + + # (Maybe) one could calculate the angle between FSR & the "main" jet they correspond to + # - this would be correlated with the mass of the decaying p. via the dead-cone effect, + # - could be a discriminating variable at the event level. + + # ================================================================================= + # + # # Build Higgs candidate w/ or w/o FSR + # + # --------------------------------------------------------------------------------- + + # Build 4-vector from leading + subleading jets, with or without FSR + higgs_cand_no_fsr = ak.zip({ + "jet1" : leading, + "jet2" : subleading, + "pt": (leading + subleading).pt, + "eta": (leading + subleading).eta, + "phi": (leading + subleading).phi, + "mass": (leading + subleading).mass, + },with_name="PtEtaPhiMLorentzVector",) + + higgs_cand = ak.zip({ + "jet1" : leading_with_fsr, + "jet2" : subleading_with_fsr, + "pt": (leading_with_fsr + subleading_with_fsr).pt, + "eta": (leading_with_fsr + subleading_with_fsr).eta, + "phi": (leading_with_fsr + subleading_with_fsr).phi, + "mass": (leading_with_fsr + subleading_with_fsr).mass, + },with_name="PtEtaPhiMLorentzVector",) + + + + # ================================================================================= + # + # # Actual event selection starts here + # + # --------------------------------------------------------------------------------- + + + # Common global requirements in the Zll channel + # - valid for 2LH and 2LL + # - valid for any region, no matter if SR or CR + + # leppair and ll_cand have different dim, leppair contains lists, + # ll_cand only numbers on innermost dim (because already reduced above) + # therefore when evaluating ak.any with axis=-1, + # ll_cand will ALWAYS be true (a.k.a. 
for every event), as long as one event fulfils the criterion + # for leppair, there needs to be one per event, as expected + # print((leppair.lep1.pt>20)) + # print((ll_cand.mass>75)) + # print((higgs_cand.mass<250)) + # print((njet>=2)) + # inside any one can then only place stuff that has one more dim + + # related to individual leptons + req_global = ak.any((leppair.lep1.pt>20) & (leppair.lep2.pt>20) \ + # opposite charge + & ((leppair.lep1.charge+leppair.lep2.charge)==0) \ + , axis=-1 + ) + # cands and global stuff + # note: V_pt > 60 as in AT, AN: 50 (don't confuse) + req_global = req_global \ + & (ll_cand.pt>60) \ + & (njet>=2) \ + & (higgs_cand.mass<250) + + + selection.add('global_selection',ak.to_numpy(req_global)) + + + mask2e = req_global & (nele == 2) + mask2mu = req_global & (nmu == 2) + + #mask2lep = [ak.any(tup) for tup in zip(maskemu, mask2mu, mask2e)] + mask2lep = [ak.any(tup) for tup in zip(mask2mu, mask2e)] + + good_leptons = ak.mask(good_leptons,mask2lep) + + + #output['cutflow'][dataset]['selected Z pairs'] += ak.sum(ak.num(good_leptons)>0) + + selection.add('ee',ak.to_numpy(nele == 2)) + selection.add('mumu',ak.to_numpy(nmu == 2)) + + + #print(higgs_cand.type) + #print(ll_cand.type) + + # global already contains Vpt>60 as the lower bound + # global also has higgs_cand.mass<250 + req_sr_Zll = (ll_cand.mass > 75) & (ll_cand.mass < 105) \ + & (higgs_cand.delta_phi(ll_cand)>2.5) \ + & (higgs_cand.mass>=50) & (higgs_cand.mass<=200) \ + & (leading.btagDeepFlavCvL>0.225) & (leading.btagDeepFlavCvB>0.4) + # flip H mass, otherwise same + req_cr_Zcc = (ll_cand.mass > 85) & (ll_cand.mass < 97) \ + & (higgs_cand.delta_phi(ll_cand)>2.5) \ + & ~((higgs_cand.mass>=50) & (higgs_cand.mass<=200)) \ + & (leading.btagDeepFlavCvL>0.225) & (leading.btagDeepFlavCvB>0.4) + # Note: m_ll requirement not in AN, but in AT + req_cr_Z_LF = (ll_cand.mass > 75) & (ll_cand.mass < 105) \ + & (higgs_cand.delta_phi(ll_cand)>2.5) \ + & (higgs_cand.mass>=50) & (higgs_cand.mass<=200) \ + & (leading.btagDeepFlavCvL<0.225) & (leading.btagDeepFlavCvB>0.4) + + req_cr_Z_HF = (ll_cand.mass > 85) & (ll_cand.mass < 97) \ + & (higgs_cand.delta_phi(ll_cand)>2.5) \ + & (higgs_cand.mass>=50) & (higgs_cand.mass<=200) \ + & (leading.btagDeepFlavCvL>0.225) & (leading.btagDeepFlavCvB<0.4) + + req_cr_t_tbar = ~((ll_cand.mass>0) & (ll_cand.mass<10)) & ~((ll_cand.mass>75) & (ll_cand.mass<120)) \ + & (higgs_cand.mass>=50) & (higgs_cand.mass<=200) \ + & (leading.btagDeepFlavCvL>0.225) & (leading.btagDeepFlavCvB<0.4) + + req_sr_Zll_vpt_low = req_global & req_sr_Zll & (ll_cand.pt<150) + # print(ll_cand.pt<150) + # print(ak.any(ll_cand.pt<150, axis=-1) + # print(req_sr_Zll_vpt_low) + req_sr_Zll_vpt_high = req_global & req_sr_Zll & (ll_cand.pt>150) + # print(ll_cand.pt>150) + # print(req_sr_Zll_vpt_high) + # print(len(req_sr_Zll_vpt_low)) + # print(len(req_sr_Zll_vpt_low == req_sr_Zll_vpt_high)) + # print(np.sum(ak.to_numpy(req_sr_Zll_vpt_low))) + # print(np.sum(ak.to_numpy(req_sr_Zll_vpt_low == req_sr_Zll_vpt_high))) + + req_cr_Zcc_vpt_low = req_global & req_cr_Zcc & (ll_cand.pt<150) & (ll_cand.pt>50) + # print(req_sr_Zll_vpt_low) + req_cr_Zcc_vpt_high = req_global & req_cr_Zcc & (ll_cand.pt>150) + # print(req_sr_Zll_vpt_high) + # print(np.sum(ak.to_numpy(req_sr_Zll_vpt_low & req_sr_Zll_vpt_high))) + + req_cr_Z_LF_vpt_low = req_global & req_cr_Z_LF & (ll_cand.pt<150) + req_cr_Z_LF_vpt_high = req_global & req_cr_Z_LF & (ll_cand.pt>150) + + req_cr_Z_HF_vpt_low = req_global & req_cr_Z_HF & (ll_cand.pt<150) + req_cr_Z_HF_vpt_high = 
req_global & req_cr_Z_HF & (ll_cand.pt>150) + + req_cr_t_tbar_vpt_low = req_global & req_cr_t_tbar & (ll_cand.pt<150) + req_cr_t_tbar_vpt_high = req_global & req_cr_t_tbar & (ll_cand.pt>150) + + + #prob not necessary + #selection.add('SR',ak.to_numpy(req_sr_Zll)) + + selection.add('SR_2LL',ak.to_numpy(req_sr_Zll_vpt_low)) + selection.add('SR_2LH',ak.to_numpy(req_sr_Zll_vpt_high)) + selection.add('CR_Zcc_2LL',ak.to_numpy(req_cr_Zcc_vpt_low)) + selection.add('CR_Zcc_2LH',ak.to_numpy(req_cr_Zcc_vpt_high)) + selection.add('CR_Z_LF_2LL',ak.to_numpy(req_cr_Z_LF_vpt_low)) + selection.add('CR_Z_LF_2LH',ak.to_numpy(req_cr_Z_LF_vpt_high)) + selection.add('CR_Z_HF_2LL',ak.to_numpy(req_cr_Z_HF_vpt_low)) + selection.add('CR_Z_HF_2LH',ak.to_numpy(req_cr_Z_HF_vpt_high)) + selection.add('CR_t_tbar_2LL',ak.to_numpy(req_cr_t_tbar_vpt_low)) + selection.add('CR_t_tbar_2LH',ak.to_numpy(req_cr_t_tbar_vpt_high)) + + + + + + # ================================================================================= + # + # # Calculate and store weights & factors + # + # --------------------------------------------------------------------------------- + + # there is also nProcEvents, which might be related to nEvents by some factor + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/HelperClasses/SampleContainer.cc + # there are some more calculations related to weights, e.g. + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/HelperClasses/SampleContainer.cc#L115-L154 + + # ToDo: + # [ ] LHEScaleWeight ?? + # [ ] intWeight - is this only relevant when running over the post-processed samples, or already on top of Nano+AK15? + # [x] genWeight + # [ ] PrefireWeight - (for 2016+2017) see also: + # https://github.com/mastrolorenzo/AnalysisTools-1/blob/master/plugins/VHccAnalysis.cc#L2099-L2113 + # [ ] weight_PU + # [ ] weight_ptEWK + # [(x)] Lep_SF - but I'm not sure about EOY / UL compatibility + # [ ] recoWReWeight + # [ ] WJetNLOWeight + # [ ] cTagWeight - later, also including up/down syst + # [ ] weight_mettrigSF + # [ ] weight_puid - not the same as _PU + # [ ] weight_subptEWKnnlo - find out what "SubGen" is + # + # [ ] LOtoNLOWeightBjetSplitEtabb + # [ ] WPtCorrFactor + # [ ] ZPtCorrFactor + + + + + # running over more than just the Double[] datasets, but still requiring the same trigger + # not sure if correct + if 'DoubleEG' in dataset or 'Electron' in dataset: + output['cutflow'][dataset]['trigger'] += ak.sum(trigger_ee) + elif 'Muon' in dataset : + output['cutflow'][dataset]['trigger'] += ak.sum(trigger_mm) + + + # Successively add another cut w.r.t. 
previous line, looks a bit like N-1 histograms + output['cutflow'][dataset]['jet selection'] += ak.sum(njet>=2) + output['cutflow'][dataset]['global selection'] += ak.sum(req_global) + output['cutflow'][dataset]['signal region'] += ak.sum(req_global & req_sr_Zll) + output['cutflow'][dataset]['signal region & ee or mumu'] += ak.sum(req_global & req_sr_Zll & ( ((nele == 2) & trigger_ee) | ((nmu == 2) & trigger_mm))) + output['cutflow'][dataset]['signal ee'] += ak.sum(req_global & req_sr_Zll & (nele == 2) & trigger_ee) + output['cutflow'][dataset]['signal mumu'] += ak.sum(req_global & req_sr_Zll & (nmu == 2) & trigger_mm) + + + lepflav = ['ee','mumu'] + reg = ['SR_2LL','SR_2LH', + 'CR_Zcc_2LL','CR_Zcc_2LH', + 'CR_Z_LF_2LL','CR_Z_LF_2LH', + 'CR_Z_HF_2LL','CR_Z_HF_2LH', + 'CR_t_tbar_2LL','CR_t_tbar_2LH'] + + #print(possible_flavSplits) + list_weights = [] + lists_of_vars = {} + names = ['wei', 'Higgs_mass', 'Higgs_pt', 'Z_mass', 'Z_mass_gen', 'Z_pt_gen', 'Z_pt', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_phi_jj', 'del_eta_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + modes = ['low_ee', 'high_ee', 'low_mumu', 'high_mumu'] + for name in names: + for mode in modes: + lists_of_vars[f'{name}_{mode}'] = [] + ''' + lists_of_vars = {'wei': [], + 'Higgs_mass': [], + 'Higgs_pt': [], + 'Z_pt': [], + 'jjVptratio': [], + 'CvsL_max': [], + 'CvsL_min': [], + 'CvsB_max': [], + 'CvsB_min': [], + 'pt_lead': [], + 'pt_sublead': [], + 'del_phi_jjV': [], + 'del_R_jj': [], + 'del_eta_jj': [], + 'del_phi_ll': [], + 'del_eta_ll': [], + 'del_phi_l2_subleading': [], + 'del_phi_l2_leading': [] + } + ''' + #### write into histograms (i.e. write output) + for histname, h in output.items(): + for s in possible_flavSplits: + dataset_renamed = dataset if s == 'already_split_sample' else dataset + s + for ch in lepflav: + for r in reg: + cut = selection.all('lepsel', + 'jetsel', + 'global_selection', + 'metfilter', + 'lumi', + r, + ch, + s, + 'trigger_%s'%(ch)) + llcut = ll_cand[cut] + # this next line is necessary if running with multiple possible ll candidates + #llcut = llcut[:,0] + + lep1cut = llcut.lep1 + lep2cut = llcut.lep2 + #print(self._version) + if not isRealData and not self._debug: + #print('not data, not test') + if ch == 'ee': + lepsf = eleSFs(lep1cut, self._year, self._corr) * eleSFs(lep2cut, self._year, self._corr) + elif ch == 'mumu': + lepsf = muSFs(lep1cut, self._year, self._corr) * muSFs(lep2cut, self._year, self._corr) + ''' + # This would be emu channel, which does not exist in the VHcc Zll case + else: + lepsf = np.where(lep1cut.lep_flav == 11, + eleSFs(lep1cut, self._year, self._corr) * muSFs(lep2cut, self._year, self._corr), + 1.) \ + * np.where(lep1cut.lep_flav == 13, + eleSFs(lep2cut, self._year, self._corr) * muSFs(lep1cut, self._year, self._corr), + 1.) + ''' + else : + #lepsf = weights.weight()[cut] + # AS: if I understand correctly, this only works because in case of data, weights are identically 1 for every entry + # otherwise this would double count the weights in a later step (where lepsf gets multiplied by the weights!) 
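+                        # ak.full_like(x, 1) builds an array of ones with the same
+                        # length and layout as x, so for data the lepton SF is exactly 1
+                        # and multiplying by it later leaves the event weights unchanged.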
+ lepsf = ak.full_like(weights.weight()[cut], 1) + #print(lepsf) + # print(weights.weight()[cut]*lepsf) + # print(lepsf) + ''' + if self._export_array and not isRealData: + if ch == 'ee' and r == 'SR_2LL' and s == '_cc': + eell_cand = ak.zip({ + "Higgs_mass" : higgs_cand['mass'][cut] * lepsf, + #"jet2" : subleading_with_fsr, + #"pt": (leading_with_fsr + subleading_with_fsr).pt, + #"eta": (leading_with_fsr + subleading_with_fsr).eta, + #"phi": (leading_with_fsr + subleading_with_fsr).phi, + #"mass": (leading_with_fsr + subleading_with_fsr).mass, + }) + print(eell_cand) + ''' + if 'leading_jetflav_' in histname and 'sub' not in histname: + #print(dir(leading)) + #print(h.axes) + names = [ax.name for ax in h.axes] + fields = {l: normalize(leading[histname.replace('leading_jetflav_','')], + cut) for l in names if l in dir(leading)} + #print(fields) + #sys.exit() + if isRealData: + flavor = ak.zeros_like(normalize(leading['pt'],cut)) + else: + flavor = normalize(leading.hadronFlavour,cut) + h.fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + flav = flavor, + **fields, + weight = weights.weight()[cut] * lepsf) + elif 'subleading_jetflav_' in histname: + #print(dir(subleading)) + names = [ax.name for ax in h.axes] + fields = {l: normalize(subleading[histname.replace('subleading_jetflav_','')], + cut) for l in names if l in dir(subleading)} + if isRealData: + flavor = ak.zeros_like(normalize(subleading['pt'],cut)) + else: + flavor = normalize(subleading.hadronFlavour,cut) + h.fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + flav = flavor, + **fields, + weight = weights.weight()[cut] * lepsf) + elif 'lep1_' in histname: + names = [ax.name for ax in h.axes] + fields = {l: ak.fill_none(flatten(lep1cut[histname.replace('lep1_','')]), + np.nan) for l in names if l in dir(lep1cut)} + h.fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + **fields, + weight = weights.weight()[cut] * lepsf) + elif 'lep2_' in histname: + names = [ax.name for ax in h.axes] + fields = {l: ak.fill_none(flatten(lep2cut[histname.replace('lep2_','')]), + np.nan) for l in names if l in dir(lep2cut)} + h.fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + **fields, + weight = weights.weight()[cut] * lepsf) + #elif 'MET_' in histname: + # fields = {l: normalize(events.MET[histname.replace('MET_','')], + # cut) for l in names if l in dir(events.MET)} + # h.fill( + # datasetSplit = dataset_renamed, + # lepflav = ch, + # region = r, + # **fields, + # weight = weights.weight()[cut] * lepsf) + elif 'll_' in histname: + names = [ax.name for ax in h.axes] + fields = {l: ak.fill_none(flatten(llcut[histname.replace('ll_','')]), + np.nan) for l in names if l in dir(llcut)} + #print(max(llcut['pt'])) + h.fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + **fields, + weight = weights.weight()[cut] * lepsf) + elif 'jj_' in histname: + names = [ax.name for ax in h.axes] + fields = {l: normalize(higgs_cand[histname.replace('jj_','')], + cut) for l in names if l in dir(higgs_cand)} + h.fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + **fields, + weight = weights.weight()[cut] * lepsf) + else: + output['nj'].fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + nj = normalize(ak.num(jet_conditions),cut), + weight = weights.weight()[cut]*lepsf) + # check? 
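+                        # The ak.where constructions in the two fills below compute, per event,
+                        #   nAdd    = max(n_selected_jets - 2, 0)
+                        #   nAddFSR = max(nAdd - n_fsr_only_jets, 0)
+                        # where n_fsr_only_jets counts jets passing the FSR criteria
+                        # but failing the regular jet selection.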
+ output['nAddJets302p5_puid'].fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + nAddJets302p5_puid = normalize(ak.where((ak.num(jet_conditions) > 2), + (ak.num(jet_conditions)-2), + (ak.zeros_like(ak.num(jet_conditions))) + ), + cut), + weight = weights.weight()[cut]*lepsf) + # check? + output['nAddJetsFSRsub302p5_puid'].fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + nAddJetsFSRsub302p5_puid = normalize(ak.where((ak.where((ak.num(jet_conditions) > 2), + (ak.num(jet_conditions)-2), + (ak.zeros_like(ak.num(jet_conditions))) + ) + -ak.num((~jet_conditions) & (fsr_conditions))) > 0, + (ak.where((ak.num(jet_conditions) > 2), + (ak.num(jet_conditions)-2), + (ak.zeros_like(ak.num(jet_conditions))) + ) + -ak.num((~jet_conditions) & (fsr_conditions))), + (ak.zeros_like(ak.num(jet_conditions)))), + cut), + weight = weights.weight()[cut]*lepsf) + #if not isRealData: + # output['weight_full'].fill( + # datasetSplit = dataset_renamed, + # lepflav = ch, + # region = r, + # weight_full = weights.weight()[cut]*lepsf) + # output['genweight'].fill( + # datasetSplit = dataset_renamed, + # lepflav = ch, + # region = r, + # genWeight = events.genWeight[cut]) + # output['sign_genweight'].fill( + # datasetSplit = dataset_renamed, + # lepflav = ch, + # region = r, + # genWeight_by_abs = (events.genWeight/abs(events.genWeight))[cut]) + output['jjVPtRatio'].fill( + datasetSplit = dataset_renamed, + lepflav = ch, + region = r, + jjVPtRatio = (normalize(higgs_cand['pt'], + cut) / ak.fill_none(flatten(llcut['pt']), + np.nan)), + weight = weights.weight()[cut] * lepsf) + if self._export_array and not isRealData: + import pandas as pd + #output['array'][dataset]['weight'] += processor.column_accumulator( + # ak.to_numpy(weights.weight()[cut] * lepsf) + # ) + + list_weights.append(ak.to_numpy(weights.weight()[cut] * lepsf)) + + roi = ['SR_2LL','SR_2LH'] + lepflav_chosen = ['ee','mumu'] + names_dict = {'wei': weights.weight()[cut]* lepsf , #weights.weight()[cut] * lepsf + 'Higgs_mass': higgs_cand['mass'][cut], + 'Higgs_pt': higgs_cand['pt'][cut], + 'Z_mass': ll_cand['mass'][cut], + 'Z_mass_gen': ll_cand_gen['mass'][cut], + 'Z_pt_gen': ll_cand_gen['pt'][cut], + 'Z_pt': ll_cand['pt'][cut], + 'jjVptratio': (higgs_cand['pt'][cut])/ (ll_cand['pt'][cut]), + 'CvsL_max': leading_with_fsr['jet1']['btagDeepFlavCvL'][cut], + 'CvsL_min': subleading_with_fsr['jet1']['btagDeepFlavCvL'][cut], + 'CvsB_max': leading_with_fsr['jet1']['btagDeepFlavCvB'][cut], + 'CvsB_min': subleading_with_fsr['jet1']['btagDeepFlavCvB'][cut], + 'pt_lead': leading_with_fsr['jet1']['pt'][cut], + 'pt_sublead': subleading_with_fsr['jet1']['pt'][cut], + 'del_phi_jjV': np.abs((higgs_cand[cut]).delta_phi(ll_cand[cut])), + 'del_R_jj': np.abs((higgs_cand['jet1'][cut]).delta_r(higgs_cand['jet2'][cut])), + 'del_eta_jj': np.abs((higgs_cand['jet1']['eta'][cut]) - (higgs_cand['jet2']['eta'][cut])), + 'del_phi_jj': np.abs((higgs_cand['jet1'][cut]).delta_phi(higgs_cand['jet2'][cut])), + 'del_phi_ll': np.abs((ll_cand['lep1'][cut]).delta_phi(ll_cand['lep2'][cut])), + 'del_eta_ll': np.abs((ll_cand['lep1']['eta'][cut]) - (ll_cand['lep2']['eta'][cut])), + 'del_phi_l2_subleading': np.abs((ll_cand['lep2'][cut]).delta_phi(higgs_cand['jet2'][cut])), + 'del_phi_l2_leading': np.abs((ll_cand['lep2'][cut]).delta_phi(higgs_cand['jet1'][cut])) + } + if ch in lepflav_chosen and r in roi: + if ch == 'ee': + if r == 'SR_2LL': + for var_name, var_value in names_dict.items(): + lists_of_vars[f'{var_name}_low_ee'].append(ak.to_numpy(var_value)) + 
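+                                        # ('low'/'high' in the array names refers to V pT below/above
+                                        # 150 GeV, i.e. the SR_2LL / SR_2LH split defined earlier)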
#output['array'][dataset][f'{var_name}_low_ee'] += processor.column_accumulator(
+                                #    ak.to_numpy(var_value)
+                                #    )
+                        elif r == 'SR_2LH':
+                            for var_name, var_value in names_dict.items():
+                                lists_of_vars[f'{var_name}_high_ee'].append(ak.to_numpy(var_value))
+                                #output['array'][dataset][f'{var_name}_high_ee'] += processor.column_accumulator(
+                                #    ak.to_numpy(var_value)
+                                #    )
+                    elif ch == 'mumu':
+                        if r == 'SR_2LL':
+                            for var_name, var_value in names_dict.items():
+                                lists_of_vars[f'{var_name}_low_mumu'].append(ak.to_numpy(var_value))
+                                #output['array'][dataset][f'{var_name}_low_mumu'] += processor.column_accumulator(
+                                #    ak.to_numpy(var_value)
+                                #    )
+                        elif r == 'SR_2LH':
+                            for var_name, var_value in names_dict.items():
+                                lists_of_vars[f'{var_name}_high_mumu'].append(ak.to_numpy(var_value))
+                                #output['array'][dataset][f'{var_name}_high_mumu'] += processor.column_accumulator(
+                                #    ak.to_numpy(var_value)
+                                #    )
+
+
+
+        # TODO: jet-energy regression and kinematic fit are not applied yet
+        # flatten the per-chunk lists into plain 1D numpy arrays
+        list_weights = np.array([item for sublist in list_weights for item in sublist])
+        #print(list_weights)
+        #print(lists_of_vars)
+        for v_name in lists_of_vars.keys():
+            lists_of_vars[v_name] = np.array([item for sublist in lists_of_vars[v_name] for item in sublist])
+        #print(lists_of_vars)
+
+        # disabled CSV-based export variant, kept for reference
+        '''
+        try:
+            df_weights = pd.read_csv(f'{folder_save}/{dataset}/{filename}/test_save_weights_full.csv')
+        except FileNotFoundError:
+            df_weights = pd.DataFrame([], columns = ['weights'])
+        '''
+        # read-append-write: extend the on-disk weight array with this chunk's weights
+        try:
+            weights_array = np.load(f'{folder_save}/{dataset}/{filename}/test_weights_full.npy')
+        except FileNotFoundError:
+            weights_array = np.array([])
+
+        df_wei = pd.DataFrame([], columns = ['weights'])
+        df_wei['weights'] = list_weights
+        weight = np.array(list_weights)
+
+
+        #df_weights_full = pd.concat([df_weights, df_wei], ignore_index = True)
+        #df_wei.to_csv(f'{folder_save}/{dataset}/{filename}/test_save_weights.csv', sep=',', encoding='utf-8', index=False)
+        #df_weights_full.to_csv(f'{folder_save}/{dataset}/{filename}/test_save_weights_full.csv', sep=',', encoding='utf-8', index=False)
+        weights_full = np.concatenate((weights_array, weight), axis = None)
+        np.save(f'{folder_save}/{dataset}/{filename}/test_weights_full.npy', weights_full, allow_pickle = False)
+
+        # disabled CSV-based export variant, kept for reference
+        '''
+        try:
+            df_else_everything = pd.read_csv(f'{folder_save}/{dataset}/{filename}/test_else_save_no_weights_full.csv')
+        except FileNotFoundError:
+            df_else_everything = pd.DataFrame([], columns = [v_name for v_name in lists_of_vars.keys()])
+        df_else = pd.DataFrame([], columns = [v_name for v_name in lists_of_vars.keys()])
+        #print(df_else)
+        for var in lists_of_vars.keys():
+            df_else[var] = pd.Series(lists_of_vars[var])
+        '''
+
+        # same read-append-write pattern for every exported training variable
+        for var in lists_of_vars.keys():
+            try:
+                else_var_array = np.load(f'{folder_save}/{dataset}/{filename}/test_{var}_full.npy')
+            except FileNotFoundError:
+                else_var_array = np.array([])
+            else_v_curr_array = np.array(lists_of_vars[var])
+            else_var_full_array = np.concatenate((else_var_array, else_v_curr_array), axis = None)
+            np.save(f'{folder_save}/{dataset}/{filename}/test_{var}_full.npy', else_var_full_array, allow_pickle = False)
+
+        #df_else_full = pd.concat([df_else_everything, df_else], ignore_index = True)
+
+        #df_else.to_csv(f'{folder_save}/{dataset}/{filename}/test_else_save_no_weights.csv', sep=',', encoding='utf-8', index=False)
+        #df_else_full.to_csv(f'{folder_save}/{dataset}/{filename}/test_else_save_no_weights_full.csv', sep=',', encoding='utf-8', index=False)
+
+        return {dataset: output}
+
+    def postprocess(self, accumulator):
+        #print(accumulator)
+        
return accumulator diff --git a/cfg_VHcc_mod.py b/cfg_VHcc_mod.py new file mode 100644 index 0000000..b9d8e20 --- /dev/null +++ b/cfg_VHcc_mod.py @@ -0,0 +1,163 @@ +# Local Variables: +# python-indent-offset: 4 +# End: + +from VHcc.workflows.Zll_process_newHist_pandas_small_update_isolation import ( + NanoProcessor as VH_Zll, +) + +cfg = { + "userconfig": {'version':'test_nolepsf'}, + "dataset": { + "jsons": [ + "src/VHcc/metadata/bg_rwth_test_new_10.json"], + "campaign": "UL17", + "year": "2017", + "filter": { + "samples": [ + "DYJetsToLL_nlo_vau_bg" + #"ZHToCC_vau_sig" + #"DYJetsToLL_nlo", + #"DY1ToLL_PtZ-250To400", + #"DY1ToLL_PtZ-50To150", + #"DY1ToLL_PtZ-150To250", + #"DY1ToLL_PtZ-400ToInf", + #"DY2ToLL_PtZ-50To150", + #"DY2ToLL_PtZ-150To250", + #"DY2ToLL_PtZ-250To400", + #"DY2ToLL_PtZ-400ToInf", + #"", + #"", + #"", + ], + "samples_exclude": [], + }, + }, + # Input and output files + "workflow": VH_Zll, + "output": "output_vhcc_zll", + "run_options": { + #"executor": "parsl/condor/naf_lite", + "executor": "parsl/condor", + #"executor": "futures", + "workers": 1, + "scaleout": 150, + "walltime": "01:00:00", + "mem_per_worker": 2, # GB + "chunk": 50000, + "max": None, + "skipbadfiles": True, + "voms": None, + "limit": 80, + "retries": 20, + "splitjobs": False, + "requirements": ( + '( Machine != "lx1b02.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3a03.physik.rwth-aachen.de") && ' + '( Machine != "lx3a05.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3a06.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3a09.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3a13.physik.rwth-aachen.de") && ' + '( Machine != "lx3a14.physik.rwth-aachen.de") && ' + '( Machine != "lx3a15.physik.rwth-aachen.de") && ' + '( Machine != "lx3a23.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3a25.physik.rwth-aachen.de") && ' + '( Machine != "lx3a27.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3a46.physik.rwth-aachen.de") && ' + '( Machine != "lx3a44.physik.rwth-aachen.de") && ' + '( Machine != "lx3a47.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3a55.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3a56.physik.rwth-aachen.de") && ' + '( Machine != "lx3b08.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3b09.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3b13.physik.rwth-aachen.de") && ' + '( Machine != "lx3b18.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3b24.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3b29.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3b32.physik.rwth-aachen.de") && ' + '( Machine != "lx3b33.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3b34.physik.rwth-aachen.de") && ' + '( Machine != "lx3b41.physik.rwth-aachen.de") && ' + '( Machine != "lx3b46.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3b47.physik.rwth-aachen.de") && ' + '( Machine != "lx3b48.physik.rwth-aachen.de") && ' + '( Machine != "lx3b49.physik.rwth-aachen.de") && ' + '( Machine != "lx3b52.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3b55.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3b57.physik.rwth-aachen.de") && ' + '( Machine != "lx3b62.physik.rwth-aachen.de") && ' + '( Machine != "lx3b66.physik.rwth-aachen.de") && ' + '( Machine != "lx3b68.physik.RWTH-Aachen.de") && ' + '( Machine != "lx3b69.physik.rwth-aachen.de") && ' + '( Machine != "lx3b70.physik.rwth-aachen.de") && ' + '( Machine != "lx3b71.physik.rwth-aachen.de") && ' + '( Machine != "lx3b99.physik.rwth-aachen.de") && ' + '( Machine != "lxblade01.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade02.physik.RWTH-Aachen.de") && ' + '( Machine != 
"lxblade03.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade04.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade05.physik.rwth-aachen.de") && ' + '( Machine != "lxblade06.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade07.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade08.physik.rwth-aachen.de") && ' + '( Machine != "lxblade09.physik.rwth-aachen.de") && ' + '( Machine != "lxblade10.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade11.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade12.physik.rwth-aachen.de") && ' + '( Machine != "lxblade13.physik.rwth-aachen.de") && ' + '( Machine != "lxblade14.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade15.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade16.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade17.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade18.physik.rwth-aachen.de") && ' + '( Machine != "lxblade19.physik.rwth-aachen.de") && ' + '( Machine != "lxblade20.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade21.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade22.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade23.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade24.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade25.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade26.physik.rwth-aachen.de") && ' + '( Machine != "lxblade27.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade28.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade29.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade30.physik.RWTH-Aachen.de") && ' + '( Machine != "lxblade31.physik.rwth-aachen.de") && ' + '( Machine != "lxblade32.physik.rwth-aachen.de") && ' + '( Machine != "lxcip01.physik.rwth-aachen.de") && ' + '( Machine != "lxcip02.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip05.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip06.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip09.physik.rwth-aachen.de") && ' + '( Machine != "lxcip10.physik.rwth-aachen.de") && ' + '( Machine != "lxcip11.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip12.physik.rwth-aachen.de") && ' + '( Machine != "lxcip14.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip15.physik.rwth-aachen.de") && ' + '( Machine != "lxcip16.physik.rwth-aachen.de") && ' + '( Machine != "lxcip17.physik.rwth-aachen.de") && ' + '( Machine != "lxcip18.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip19.physik.rwth-aachen.de") && ' + '( Machine != "lxcip24.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip25.physik.rwth-aachen.de") && ' + '( Machine != "lxcip26.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip27.physik.rwth-aachen.de") && ' + '( Machine != "lxcip28.physik.rwth-aachen.de") && ' + '( Machine != "lxcip29.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip30.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip31.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip32.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip34.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip35.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip50.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip51.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip52.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip53.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip54.physik.RWTH-Aachen.de") && ' + '( Machine != "lxcip55.physik.rwth-aachen.de") && ' + '( Machine != "lxcip56.physik.rwth-aachen.de") && ' + '( Machine != "lxcip57.physik.rwth-aachen.de") && ' + '( Machine != "lxcip58.physik.rwth-aachen.de") && ' + '( Machine != "lxcip59.physik.rwth-aachen.de")'), + 
}, +} diff --git a/xgb_test.py b/xgb_test.py new file mode 100644 index 0000000..80ccb58 --- /dev/null +++ b/xgb_test.py @@ -0,0 +1,456 @@ +from coffea.util import load +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt, mplhep as hep +import hist +import argparse, sys, os, arrow, glob, yaml +from matplotlib.offsetbox import AnchoredText + +from BTVNanoCommissioning.utils.plot_utils import ( + plotratio, + +) +net_path = "/net/scratch_cms3a/vaulin/" +folder_save = 'eval_23_03_14' +if not os.path.exists(f"./plot/{folder_save}"): + os.mkdir(f"./plot/{folder_save}") +if not os.path.exists(net_path + f"plot/{folder_save}"): + os.mkdir(net_path + f"plot/{folder_save}") +def autoranger(array): + val, axis = array, np.arange(0,len(array)+1) + for i in range(len(val)): + if val[i] != 0: + mins = i + break + for i in reversed(range(len(val))): + if val[i] != 0: + maxs = i + 1 + break + print(axis[mins], axis[maxs]) + return axis[mins], axis[maxs], np.max(val), np.min(val) +names_sig = ['wei','Higgs_mass', 'Higgs_pt', 'Z_pt', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +######################################################################### +######### Reading the bg as output and signal as signal ################# +######################################################################### +output_names = ['output_vhcc_zll_v81_bg_5_files_1_chunk', 'output_vhcc_zll_v81_bg_5_files_2_chunk', 'output_vhcc_zll_v81_bg_5_files_3_chunk', 'output_vhcc_zll_v81_bg_5_files_4_chunk', 'output_vhcc_zll_v81_bg_5_files_5_chunk', 'output_vhcc_zll_v81_bg_5_files_6_chunk', 'output_vhcc_zll_v81_bg_5_files_7_chunk', 'output_vhcc_zll_v81_bg_5_files_8_chunk', 'output_vhcc_zll_v81_bg_5_files_9_chunk', 'output_vhcc_zll_v81_bg_6_files_10_chunk'] +signal_names = ['output_vhcc_zll_v54_signal_35_files'] + +outputs = [load(f"{name}/output.coffea") for name in output_names] +signals = [load(f"{name}/output.coffea") for name in signal_names] + +outputs = [out['DYJetsToLL_nlo_vau_bg'] for out in outputs] +signals = [sig['ZHToCC_vau_sig'] for sig in signals] + + +output=load('output_vhcc_zll_v47_bg_2_files/output.coffea') +signal=load('output_vhcc_zll_v54_signal_5_files/output.coffea') + +output = output['DYJetsToLL_nlo_vau_bg'] +signal = signal['ZHToCC_vau_sig'] +#print(output['array']) +######################################################################### +########## Testing bg to see the structure ############################## +######################################################################### +for f in output['array'].keys(): + print(f) + try: + for k in f.keys(): + print(k) + except AttributeError: + print("No keys found") +######################################################################### + + +######################################################################### +######### Testing sig to see the structure ############################## +######################################################################### +for f in signal['array'].keys(): + print(f) + try: + for k in f.keys(): + print(k) + except AttributeError: + print("No keys found") +######################################################################### + +######################################################################### +###### Reading the arrays into collect_var dictionary for sig ########### 
+######################################################################### +names_sig = ['wei','Higgs_mass', 'Higgs_pt', 'Z_pt', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +def output_collect_sig(sig): + sumw_sig = {} + collect_var_sig={} + varlist_sig = ['weight'] + + names_sig = ['wei','Higgs_mass', 'Higgs_pt', 'Z_pt', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + for name in names_sig: + varlist_sig.append(f'{name}_low_ee') + varlist_sig.append(f'{name}_high_ee') + varlist_sig.append(f'{name}_low_mumu') + varlist_sig.append(f'{name}_high_mumu') + + + for s in sig['array'].keys(): + #iterated samples inside coffea file + if s not in sumw_sig.keys():sumw_sig[s]=sig['array'][s]['sumw'] + else:sumw_sig[s] += sig['array'][s]['sumw'] + + if s not in collect_var_sig.keys():collect_var_sig[s]={} + # iterate regions(SR, CR, for H+c) + for var in varlist_sig: + # get arrays for each variable + if var=='BDT' : continue + if var not in list(collect_var_sig[s].keys()):collect_var_sig[s][var]=sig['array'][s][var].value + else:collect_var_sig[s][var]=np.concatenate((collect_var_sig[s][var],sig['array'][s][var].value)) + + #print(sumw) + #print(collect_var) + for var in collect_var_sig.keys(): + #print(var) + for key in collect_var_sig[var].keys(): + #print(key) + #print(collect_var_sig[var][key]) + #print(len(collect_var_sig[var][key])) + pass + return varlist_sig, collect_var_sig +big_signal_varlist = [] +big_signal_variable_collection = [] +for coffea in signals: + varlist_sig, collect_var_sig = output_collect_sig(coffea) + big_signal_varlist.append(varlist_sig) + big_signal_variable_collection.append(collect_var_sig) + +varlist_sig, collect_var_sig = output_collect_sig(signal) + +#print(varlist_sig) +#print(big_signal_varlist) +#print(collect_var_sig) +#print(big_signal_variable_collection) +######################################################################### + + +######################################################################### +###### Reading the arrays into collect_var dictionary for bg ############ +######################################################################### +names_bg = ['wei','Higgs_mass', 'Higgs_pt', 'Z_pt', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] +def output_collect_bg(bg_file): + sumw_bg = {} + collect_var_bg={} + varlist_bg = ['weight'] + + names_bg = ['wei','Higgs_mass', 'Higgs_pt', 'Z_pt', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + for name in names_bg: + varlist_bg.append(f'{name}_low_ee') + varlist_bg.append(f'{name}_high_ee') + varlist_bg.append(f'{name}_low_mumu') + varlist_bg.append(f'{name}_high_mumu') + + + for s in bg_file['array'].keys(): + #iterated samples inside coffea file + if s not in sumw_bg.keys():sumw_bg[s]=bg_file['array'][s]['sumw'] + else:sumw_bg[s] += bg_file['array'][s]['sumw'] + + if s not in collect_var_bg.keys():collect_var_bg[s]={} + # iterate regions(SR, CR, for H+c) + for var 
in varlist_bg: + # get arrays for each variable + if var=='BDT' : continue + if var not in list(collect_var_bg[s].keys()):collect_var_bg[s][var]=bg_file['array'][s][var].value + else:collect_var_bg[s][var]=np.concatenate((collect_var_bg[s][var],bg_file['array'][s][var].value)) + + #print(sumw) + #print(collect_var) + for var in collect_var_bg.keys(): + #print(collect_var[var]) + for key in collect_var_bg[var].keys(): + print(key) + #print(collect_var[var][key]) + #print(len(collect_var[var][key])) + return varlist_bg, collect_var_bg +varlist_bg, collect_var_bg = output_collect_bg(output) +big_bg_varlist = [] +big_bg_variable_collection = [] +for coffea in outputs: + varlist_bg, collect_var_bg = output_collect_bg(coffea) + big_bg_varlist.append(varlist_bg) + big_bg_variable_collection.append(collect_var_bg) + +#print(varlist_bg) +#print(big_bg_varlist) +#print(collect_var_bg) +#print(big_bg_variable_collection) +######################################################################### + + +######################################################################### +## Mergemap - dictionary with files, associated with their categories ### +######################################################################### +mergemap={'signal': ['ZHToCC_vau_sig'], 'bg': ['DYJetsToLL_nlo_vau_bg']} +trainvar = [] +#for s in varlist_sig: +# trainvar.append(f'{s}_signal') +#for b in varlist_bg: +# trainvar.append(f'{b}_background') +trainvar = varlist_sig +#print(trainvar) +MCvar={} +weivar={} + +for var in trainvar : + MCbkgLM = [] + MCvar[var]={} + for m in mergemap: + tmpml = [] + tmpwei = [] + if m == 'signal': + for colvarsig in big_signal_variable_collection: + tmpml=np.concatenate((tmpml,colvarsig[mergemap[m][0]][var])) + + tmpwei=np.concatenate((tmpwei,colvarsig[mergemap[m][0]]['weight'])) + elif m == 'bg': + for colvarbag in big_bg_variable_collection: + tmpml=np.concatenate((tmpml,colvarbag[mergemap[m][0]][var])) + + tmpwei=np.concatenate((tmpwei,colvarbag[mergemap[m][0]]['weight'])) + MCvar[var][m]=tmpml + weivar[m]=tmpwei + MCbkgLM+=[tmpml] + +print(MCvar.keys()) +print(MCvar['Higgs_mass_low_ee'].keys()) +len_var = [] +len_var_bg = [] + +df_sig = pd.DataFrame([], columns = [f'{col}_low_ee' for col in names_sig]) +print(df_sig) +for var in MCvar.keys(): + if '_low_ee' in var: + len_var.append(len(MCvar[var]['signal'])) + df_sig[var] = MCvar[var]['signal'] + df_sig['target'] = np.ones(np.max(len_var)) +print(df_sig) +print(np.max(len_var), np.min(len_var)) + + +df_bg = pd.DataFrame([], columns = [f'{col}_low_ee' for col in names_sig]) +print(df_bg) +for var in MCvar.keys(): + if '_low_ee' in var: + len_var_bg.append(len(MCvar[var]['bg'])) + df_bg[var] = MCvar[var]['bg'] + df_bg['target'] = np.zeros(np.max(len_var_bg)) +print(df_bg) +print(np.max(len_var_bg), np.min(len_var_bg)) + +df = pd.concat([df_sig, df_bg], ignore_index = True) +print(df) +print(df.info()) +df.to_csv('xgb_training_dataset_low_ee.csv', sep=',', encoding='utf-8', index=False) + +time = arrow.now().format("YY_MM_DD") +plt.style.use(hep.style.ROOT) +names_sig_updated = ['m(H)', '$p_t$(H)', '$p_t$(Z)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$', + '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet', + '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(e_1, e_2)$', '$\Delta\eta(e_1, e_2)$', + '$\Delta\Phi (e_{subleading}, jet_{subleading})$', '$\Delta\Phi (e_{subleading}, jet_{leading})$'] + +c = 0 +for col in names_sig[1:]: + + 
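# one pass per training variable: a quick unweighted shape comparison first,
+    # then a weighted, normalised signal/background overlay with a ratio panel
+    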
plt.figure(figsize=(10,10)) + len_sig = 0 + for i in range(0,len(df['target'])): + if df['target'][i] == 1: + len_sig += 1 + print(len_sig) + names_big_ax = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'pt_lead', 'pt_sublead'] + if col in names_big_ax: + hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_low_ee'][:len_sig])).plot() + hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_low_ee'][len_sig:])).plot() + else: + hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_low_ee'][:len_sig])).plot() + hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_low_ee'][len_sig:])).plot() + if 'pt' in col: + if 'ratio' not in col: + plt.xlabel('$p_t$ in Gev') + else: + plt.xlabel('') + elif 'mass' in col: + plt.xlabel('Mass in Gev') + else: + plt.xlabel('') + plt.ylabel("Counts") + plt.title(f'{names_sig_updated[c]}_low_ee') + plt.legend(['Signal', 'Background']) + #plt.show() + plt.savefig(f"plot/{folder_save}/{col}_low_ee.jpg") + + + + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_low_ee'][:len_sig]),bins = 80, weights = np.array(df['wei_low_ee'][:len_sig])), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + density = True, + ) + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_low_ee'][len_sig:]),bins =80, weights = np.array(df['wei_low_ee'][len_sig:])), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + density = True, + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_low_ee'][:len_sig]),bins = 80, weights = np.array(df['wei_low_ee'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_low_ee'][len_sig:]),bins =80, weights = np.array(df['wei_low_ee'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} low ee') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_low_ee'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"plot/{folder_save}/compare_{col}_low_ee.pdf") + fig.savefig(f"plot/{folder_save}/compare_{col}_low_ee.jpg") + c += 1 + + +X = df.drop("target", axis = 1) +print(X) +X = X.drop("wei_low_ee", axis = 1) +print(X) +print(X.info()) + +y = df["target"] +print(y) + + + + +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder + +categorical_pipeline = Pipeline(steps = [("impute", 
SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + +from sklearn.preprocessing import StandardScaler +numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + +cat_cols = X.select_dtypes(exclude = "number").columns +num_cols = X.select_dtypes(include = "number").columns + +print(cat_cols) +print(num_cols) + +from sklearn.compose import ColumnTransformer + +full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + +import xgboost as xgb + +X_processed = full_processor.fit_transform(X) +y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state = 1121218) + +from sklearn.metrics import accuracy_score + +### Init classifier +xgb_cl = xgb.XGBClassifier(booster = 'gbtree', base_score = 0.5, learning_rate = 0.01, gamma = 1, reg_alpha = 0.2, reg_lambda = 0.2, n_estimators = 1000, max_depth = 3, subsample = 0.8) + +### Fit +xgb_cl.fit(X_train, y_train) + +print(xgb_cl) +### Predict +preds = xgb_cl.predict(X_test) + +print(accuracy_score(y_test, preds)) + +from xgboost import plot_importance +from xgboost import plot_tree + +importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_}) +importances = importances.sort_values(by = "Importance", ascending = False) +importances = importances.set_index('Feature') +print(importances) +importances.plot.bar() + +fig, ax = plt.subplots(figsize=(17,12)) +plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax) +plt.xlabel('Feature scores') +plt.ylabel("Feature names") +plt.title('Importance plot') +plt.legend(['']) +#plt.show() +plt.savefig(f"plot/{folder_save}/importance.jpg") + +plt.figure(figsize=(17,12)) +plot_tree(xgb_cl, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"plot/{folder_save}/boost_tree.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show() diff --git a/xgb_test_no_coffea.py b/xgb_test_no_coffea.py new file mode 100644 index 0000000..16547aa --- /dev/null +++ b/xgb_test_no_coffea.py @@ -0,0 +1,605 @@ +from coffea.util import load +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt, mplhep as hep +import hist +import argparse, sys, os, arrow, glob, yaml +from matplotlib.offsetbox import AnchoredText +from pathlib import Path +import os +from BTVNanoCommissioning.utils.plot_utils import ( + plotratio, + +) +net_path = "/net/scratch_cms3a/vaulin/" +folder_save = 'eval_23_04_19_later' +if not os.path.exists(f"./plot/{folder_save}"): + os.mkdir(f"./plot/{folder_save}") +if not os.path.exists(net_path + f"plot/{folder_save}"): + os.mkdir(net_path + f"plot/{folder_save}") +def autoranger(array): + val, axis = array, np.arange(0,len(array)+1) + for i in range(len(val)): + if val[i] != 0: + mins = i + break + for i in reversed(range(len(val))): + if val[i] != 0: + maxs = i + 1 + break + print(axis[mins], axis[maxs]) + return axis[mins], axis[maxs], np.max(val), np.min(val) +names_sig = ['wei', 'Higgs_mass', 'Higgs_pt', 'Z_pt', 'Z_mass', 'Z_pt_gen', 'Z_mass_gen', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 
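# the remaining entries are angular separations (delta R / delta eta / delta phi)
+             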
'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +roiis = ['high_mumu', 'high_ee', 'low_mumu', 'low_ee'] +roi = 'low_mumu' +###################################################################################### +##### Read np arrays of signal sample ################################################ +###################################################################################### +paths_np = [str(x) for x in Path("./condor_signal_04_mid/ZHToCC_vau_sig").glob("**/*.npy") if ("_full" in str(x))] +#print(paths_np) +print(len(paths_np)) +df_sig_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_sig_full_np) + +key_np = {} +for col in names_sig: + for rois in roiis: + key_np[f'{col}_{rois}'] = [] +for col in names_sig: + for rois in roiis: + for path in paths_np: + if f'{col}_{rois}' in path: + key_np[f'{col}_{rois}'].append(path) +#print(key_np) +for key in key_np.keys(): + key_np[key] = [np.load(element) for element in key_np[key]] +#print(key_np) + +key_np_full = {} +for col in names_sig: + for rois in roiis: + key_np_full[f'{col}_{rois}'] = np.array([]) +for key in key_np_full.keys(): + key_np_full[key] = np.concatenate(tuple(key_np[key]), axis = None) +#print(key_np_full) + +for key in key_np_full.keys(): + df_sig_full_np[key] = pd.Series(key_np_full[key]) +print(df_sig_full_np) +df_s_new_np = df_sig_full_np[[f'{col}_{roi}' for col in names_sig]] +df_s_new_np = df_s_new_np.dropna() +print(df_s_new_np) +len_var = [] +for col in names_sig: + len_var.append(len(df_s_new_np[f'{col}_{roi}'])) + df_s_new_np['target'] = np.ones(np.max(len_var)) +print(df_s_new_np) +###################################################################################### + + +###################################################################################### +##### Read np arrays of background sample ############################################ +###################################################################################### +paths_np_back = [str(x) for x in Path("./condor_back_04_mid/DYJetsToLL_nlo_vau_bg").glob("**/*.npy") if ("_full" in str(x))] +#print(paths_np_back) +print(len(paths_np_back)) +df_back_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_back_full_np) + +key_np_back = {} +for col in names_sig: + for rois in roiis: + key_np_back[f'{col}_{rois}'] = [] +for col in names_sig: + for rois in roiis: + for path in paths_np_back: + if f'{col}_{rois}' in path: + key_np_back[f'{col}_{rois}'].append(path) +#print(key_np_back) +for key in key_np_back.keys(): + key_np_back[key] = [np.load(element) for element in key_np_back[key]] +#print(key_np_back) + +key_np_full_back = {} +for col in names_sig: + for rois in roiis: + key_np_full_back[f'{col}_{rois}'] = np.array([]) +for key in key_np_full_back.keys(): + key_np_full_back[key] = np.concatenate(tuple(key_np_back[key]), axis = None) +#print(key_np_full_back) + +for key in key_np_full_back.keys(): + df_back_full_np[key] = pd.Series(key_np_full_back[key]) +print(df_back_full_np) +df_b_new_np = df_back_full_np[[f'{col}_{roi}' for col in names_sig]] +df_b_new_np = df_b_new_np.dropna() +print(df_b_new_np) + +len_var = [] +for col in names_sig: + len_var.append(len(df_b_new_np[f'{col}_{roi}'])) + df_b_new_np['target'] = np.zeros(np.max(len_var)) +print(df_b_new_np) + +###################################################################################### + +df = 
pd.concat([df_s_new_np, df_b_new_np], ignore_index = True)
+print(df)
+print(df.info())
+df.to_csv(net_path + f'/xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8', index=False)
+
+print("% of negative weights: " + str(len(df[f"wei_{roi}"][df[f"wei_{roi}"]<0])/len(df[f"wei_{roi}"])))
+
+time = arrow.now().format("YY_MM_DD")
+plt.style.use(hep.style.ROOT)
+names_sig_updated = ['m(H)', '$p_t$(H)', '$p_t$(Z)', 'm(Z)', '$p_t$($Z_{gen}$)', 'm($Z_{gen}$)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$',
+                     '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet',
+                     '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$',
+                     '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$']
+
+c = 0
+for col in names_sig[1:]:
+
+    plt.figure(figsize=(10,10))
+    # number of signal rows; the signal block was concatenated first in df
+    len_sig = int((df['target'] == 1).sum())
+    print(len_sig)
+    names_big_ax = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'pt_lead', 'pt_sublead']
+    if col in names_big_ax:
+        hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_{roi}'][:len_sig])).plot()
+        hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_{roi}'][len_sig:])).plot()
+    else:
+        hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_{roi}'][:len_sig])).plot()
+        hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_{roi}'][len_sig:])).plot()
+    if 'pt' in col:
+        if 'ratio' not in col:
+            plt.xlabel('$p_t$ in GeV')
+        else:
+            plt.xlabel('')
+    elif 'mass' in col:
+        plt.xlabel('Mass in GeV')
+    else:
+        plt.xlabel('')
+    plt.ylabel("Counts")
+    plt.title(f'{names_sig_updated[c]}_{roi}')
+    plt.legend(['Signal', 'Background'])
+    #plt.show()
+    plt.savefig(net_path +f"plot/{folder_save}/{col}_{roi}.jpg")
+
+
+
+    fig, ((ax), (rax)) = plt.subplots(
+        2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True
+    )
+    fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97)
+    hep.cms.label("Private work", com="13", data=True, loc=0, ax=ax)  # sqrt(s) = 13 TeV for the UL17 dataset
+    counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True)
+    counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True)
+
+    counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80)
+    counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80)
+    ## plot reference
+    hep.histplot(
+        np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig])),
+        label= 'Higgs -> cc',
+        histtype="step",
+        color='r',
+        yerr=True,
+        ax=ax,
+        density = True,
+    )
+    # annotate every 5th (background) / 6th (signal) bin with its raw, unweighted event count
+    for i in range(0, len(bins2)-1):
+        x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i]
+        y_pos_sig = counts1[i] + (counts1[i] * 0.01)
+        label_p_sig = str(counts11[i])
+        x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i]
+        y_pos = counts2[i] + (counts2[i] * 0.01)
+        label_p = str(counts22[i])
+        if i%5 == 0:
+            ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green')
+        if i%6 == 0:
+            ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red')
+    ## plot compare list
+    hep.histplot(
+        np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:])),
+        label='DY bg',
+        histtype="step",
+        
color='g', + yerr=True, + ax=ax, + density = True, + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(net_path +f"plot/{folder_save}/compare_{col}_{roi}.pdf") + fig.savefig(net_path +f"plot/{folder_save}/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### No rescaling #################################################################################### + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig])) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:])) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig])), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%5 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%6 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:])), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + + ) + # plot ratio of com/Ref + + counts1, bins1 = 
np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(net_path +f"plot/{folder_save}/compare_no_dense_{col}_{roi}.pdf") + fig.savefig(net_path +f"plot/{folder_save}/compare_no_dense_{col}_{roi}.jpg") + + ###################################################################################################### + #### No rescaling hist density ###################################################################### + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, density = True) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, density = True) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%5 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%6 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + + ) + # plot ratio of com/Ref + + counts1, bins1 = 
np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(net_path +f"plot/{folder_save}/compare_np_dense_{col}_{roi}.pdf") + fig.savefig(net_path +f"plot/{folder_save}/compare_np_dense_{col}_{roi}.jpg") + + ###################################################################################################### + #### No rescaling hist density True ################################################################# + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, density = True) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, density = True) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + density = True, + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%5 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%6 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + density = True, + ) + # plot ratio of com/Ref + + counts1, bins1 = 
np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(net_path +f"plot/{folder_save}/compare_np_dense_true_{col}_{roi}.pdf") + fig.savefig(net_path +f"plot/{folder_save}/compare_np_dense_true_{col}_{roi}.jpg") + + c += 1 + +X = df.drop("target", axis = 1) +print(X) +X = X.drop(f"wei_{roi}", axis = 1) +X = X.drop(f"Z_mass_{roi}", axis = 1) +X = X.drop(f"Z_pt_gen_{roi}", axis = 1) +X = X.drop(f"Z_mass_gen_{roi}", axis = 1) +print(X) +print(X.info()) + +y = df["target"] +print(y) + + + + +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder + +categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + +from sklearn.preprocessing import StandardScaler +numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + +cat_cols = X.select_dtypes(exclude = "number").columns +num_cols = X.select_dtypes(include = "number").columns + +print(cat_cols) +print(num_cols) + +from sklearn.compose import ColumnTransformer + +full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + +import xgboost as xgb + +X_processed = full_processor.fit_transform(X) +y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state = 1121218) + +from sklearn.metrics import accuracy_score + +### Init classifier +xgb_cl = xgb.XGBClassifier(booster = 'gbtree', base_score = 0.5, learning_rate = 0.01, gamma = 1, reg_alpha = 0.2, reg_lambda = 0.2, n_estimators = 1000, max_depth = 3, subsample = 0.8) + +### Fit +xgb_cl.fit(X_train, y_train) + +print(xgb_cl) +### Predict +preds = xgb_cl.predict(X_test) + +print(accuracy_score(y_test, preds)) + +from xgboost import plot_importance +from xgboost import plot_tree + +importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_}) +importances = importances.sort_values(by = 
"Importance", ascending = False) +importances = importances.set_index('Feature') +print(importances) +importances.plot.bar() + +fig, ax = plt.subplots(figsize=(17,12)) +plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax) +plt.xlabel('Feature scores') +plt.ylabel("Feature names") +plt.title('Importance plot') +plt.legend(['']) +#plt.show() +plt.savefig(net_path + f"plot/{folder_save}/importance.jpg") + +plt.figure(figsize=(17,12)) +plot_tree(xgb_cl, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(net_path + f"plot/{folder_save}/boost_tree.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show() diff --git a/xgb_test_only_xgb.py b/xgb_test_only_xgb.py new file mode 100644 index 0000000..7d4307a --- /dev/null +++ b/xgb_test_only_xgb.py @@ -0,0 +1,361 @@ +from coffea.util import load +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt, mplhep as hep +import hist +import argparse, sys, os, arrow, glob, yaml +from matplotlib.offsetbox import AnchoredText +import xgboost as xgb +from hyperopt import STATUS_OK, Trials, fmin, hp, tpe +from sklearn.metrics import accuracy_score +from tqdm.notebook import tqdm +from sklearn.metrics import roc_auc_score, roc_curve +from sklearn.model_selection import RepeatedKFold +import json + +####################################################################################### +## Create the folder to save the data if it doesn't exist and read in the dataframe ### +####################################################################################### +net_path = "/net/scratch_cms3a/vaulin/" +folder_save = 'eval_23_04_11' +if not os.path.exists(f"./plot/{folder_save}"): + os.mkdir(f"./plot/{folder_save}") +df = pd.read_csv('xgb_training_dataset_low_ee.csv') + +time = arrow.now().format("YY_MM_DD") +plt.style.use(hep.style.ROOT) + + +######################################################################################## +########## drop target from df and bring it to a separate column, drop weights ######### +######################################################################################## +X = df.drop("target", axis = 1) +print(X) +X = X.drop("wei_low_ee", axis = 1) +print(X) +print(X.info()) + +y = df["target"] +print(y) + + +######################################################################################## +################# GRID search attempt ################################################## +######################################################################################## +''' +from sklearn.model_selection import GridSearchCV + +### Creat the parameter grid +gbm_param_grid = {'max_depth' : [3, 4, 5, 6, 7, 8, 9], 'min_child_weight' : [1], 'gamma' : [0], 'subsample' : [0.8], 'colsample_bytree' : [0.8], 'reg_alpha' : [0.005], 'n_estimators': [1000]} + +gbm = xgb.XGBRegressor() + +grid_mse = GridSearchCV(param_grid = gbm_param_grid, estimator = gbm, scoring = 'neg_mean_squared_error', cv = 4, verbose = 1) + +grid_mse.fit(X,y) + + +print("Best parameters found: ", grid_mse.best_params_) +print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_))) +''' + +######################################################################################## +############# An attempt to do hyperparameter tuning for the classifier fit ############ +######################################################################################## +space = {"max_depth": hp.quniform("max_depth", 3, 18, 1), + "gamma": hp.uniform("gamma", 1, 9), + "reg_alpha": 
hp.quniform("reg_alpha", 40, 180, 1), + "reg_lambda": hp.uniform("reg_lambda", 0, 1), + "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1), + "min_child_weight": hp.quniform("min_child_weight", 0, 10, 1), + "n_estimators": 200, + "learning_rate": hp.uniform("learning_rate", 0.001, 0.1), + "subsample": hp.uniform("subsample", 0.8, 1), + "seed":0} + +#learning_rate = space['learning_rate'], + +def objective(space): + clf = xgb.XGBClassifier( n_estimators = int(space['n_estimators']), max_depth = int(space['max_depth']), gamma = space['gamma'], reg_alpha = int(space['reg_alpha']), min_child_weight = int(space['min_child_weight']), colsample_bytree = int(space['colsample_bytree']), eval_metric = 'auc', early_stopping_rounds = 10) + evaluation = [(X_train, y_train), (X_test, y_test)] + + clf.fit(X_train, y_train, eval_set = evaluation, verbose = False) + pred = clf.predict(X_test) + accuracy = accuracy_score(y_test, pred>0.5) + print("SCORE: ", accuracy) + return {'loss': -accuracy, 'status': STATUS_OK} + +######################################################################################### +############# Create pipelines for xgb training ######################################### +######################################################################################### +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder + +categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + +from sklearn.preprocessing import StandardScaler +numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + +cat_cols = X.select_dtypes(exclude = "number").columns +num_cols = X.select_dtypes(include = "number").columns + +print(cat_cols) +print(num_cols) + +from sklearn.compose import ColumnTransformer + +full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + + + +X_processed = full_processor.fit_transform(X) +y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + +######################################################################################### +############ split dataset into training and test ####################################### +######################################################################################### +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state = 1121218) +#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 101, stratify = y) +print(X_train) +print(X_test) +print(y_train) + +############################################################################################################ +######### preparing the XGB classifiers in 20 x 5-folds cross validation using repeated k-fold ############# +############################################################################################################ +cv = RepeatedKFold(n_splits = 5, n_repeats = 20, random_state = 101) +folds = [(train, test) for train, test in cv.split(X_train, y_train)] +#print(folds) +metrics = ['auc', 'fpr', 'tpr', 'thresholds'] +results = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +eta = 0.4 +params = 
{'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': eta} +with open(f"plot/{folder_save}/results_first.json", 'w') as outfile: + json.dump(results, outfile) + + + +dtest = xgb.DMatrix(X_test, label = y_test) +#print(dtest) +for train, test in tqdm(folds, total = len(folds)): + print('train') + dtrain = xgb.DMatrix(X_train[train,:], + label = y_train[train]) + dval = xgb.DMatrix(X_train[test, :], label = y_train[test]) + model = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 10, num_boost_round = 200) #num_boost_round = 1000, 200 is optimal + sets = [dtrain, dval, dtest] + for i, ds in enumerate(results.keys()): + print(i) + y_preds = model.predict(sets[i]) + labels = sets[i].get_label() + fpr, tpr, thresholds = roc_curve(labels, y_preds) + results[ds]['fpr'].append(fpr) + results[ds]['tpr'].append(tpr) + results[ds]['thresholds'].append(thresholds) + results[ds]['auc'].append(roc_auc_score(labels, y_preds)) + +def convert(x): + if hasattr(x, "tolist"): + return x.tolist() + raise TypeError(x) + +with open(f"plot/{folder_save}/results_lr_{eta}.json", 'w') as outfile: + #json.dump(results, outfile, indent = 4) + str_j = json.dumps(results, indent = 4, sort_keys = True, default=convert) + outfile.write(str_j) + +########################################################################################################## +############## plotting the ROC curves with uncertainties ################################################ +########################################################################################################## +kind = 'val' + +c_fill = 'rgba(52, 152, 219, 0.2)' +c_line = 'rgba(52, 152, 219, 0.5)' +c_line_main = 'rgba(41, 128, 185, 1.0)' +c_grid = 'rgba(189, 195, 199, 0.5)' +c_annot = 'rgba(149, 165, 166, 0.5)' +c_highlight = 'rgba(192, 57, 43, 1.0)' + +fpr_mean = np.linspace(0, 1, 100) + +interp_tprs = [] +for i in range(100): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) +tpr_mean = np.mean(interp_tprs, axis = 0) +tpr_mean[-1] = 1.0 +tpr_std = 2*np.std(interp_tprs, axis = 0) +tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) +tpr_lower = tpr_mean - tpr_std +auc = np.mean(results[kind]['auc']) + +import plotly.graph_objects as go + +fig = go.Figure([go.Scatter(x = tpr_upper, y = fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = tpr_mean, y = fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.3f}')]) + +fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 0, y1 = 1) +fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'FPR (Background rejection)', yaxis_title = 'TPR (signal efficiency)', width = 1600, height = 900, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) +fig.update_yaxes(range = [0,1], gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') +fig.update_xaxes(range = [0,1], gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + +fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_eff.jpg") 
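+# The band above is the fold-averaged ROC with a +-2*sigma envelope (tpr_std = 2*std);
+# note that fig.write_image() relies on the kaleido package (or a legacy orca install)
+# being available to rasterise plotly figures.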
+fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_eff.pdf") + +''' +fig = go.Figure([go.Scatter(x = 1 - fpr_mean, y = tpr_upper, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = 1 - fpr_mean, y = tpr_lower, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = 1 - fpr_mean, y = tpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.3f}')]) + +fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 0, y1 = 1) +fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = '1 - FPR (Background rejection)', yaxis_title = 'TPR (signal efficiency)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) +fig.update_yaxes(range = [0,1], gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') +fig.update_xaxes(range = [0,1], gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + +fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_rej.jpg") +fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_rej.pdf") +''' +################################################################################################## +########## Actual hyperparameter tuning ########################################################## +################################################################################################## + +trials = Trials() + +#best_hyperparams = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 100, trials = trials) +#print("The best hyperparameters are: ", "\n") +#print(best_hyperparams) + + + + + + + + + + + + + + + + +from sklearn.metrics import accuracy_score + +### Init classifier +#xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = best_hyperparams['learning_rate'], gamma = best_hyperparams['gamma'], reg_alpha = best_hyperparams['reg_alpha'], reg_lambda = best_hyperparams['reg_lambda'], n_estimators = 200, max_depth = int(best_hyperparams['max_depth']), subsample = best_hyperparams['subsample'], min_child_weight = best_hyperparams['min_child_weight'], colsample_bytree = best_hyperparams['colsample_bytree']) +xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = 0.0292, gamma = 1.087, reg_alpha = 42.0, reg_lambda = 0.381, n_estimators = 200, max_depth = 8, subsample = 0.841, min_child_weight = 2.0, colsample_bytree = 0.994) + + +### Fit +dtest = xgb.DMatrix(X_test, label = y_test) +#print(dtest) +dtrain = xgb.DMatrix(X_train[:int(len(X_train)*0.8), :], label = y_train[:int(len(y_train)*0.8)]) +dval = xgb.DMatrix(X_train[int(len(X_train)*0.8):, :], label = y_train[int(len(y_train)*0.8):]) +model_xgb = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 200) #num_boost_round = 1000, +sets = [dtrain, dval, dtest] +results_new = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +params_new = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss'} + +for i, ds in enumerate(results_new.keys()): + print(i) + y_preds_new = model_xgb.predict(sets[i]) + labels_new = sets[i].get_label() + fpr_new, tpr_new, thresholds_new = roc_curve(labels_new, y_preds_new) + results_new[ds]['fpr'].append(fpr_new) + results_new[ds]['tpr'].append(tpr_new) + 
+
+xgb_cl.fit(X_train, y_train)
+
+print(xgb_cl)
+### Predict
+preds = xgb_cl.predict(X_test)
+
+print(accuracy_score(y_test, preds))
+
+print(y_test)
+print(model_xgb.predict(dtest))
+print(np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)]))
+predict_train = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)])
+
+print(accuracy_score(y_test, predict_train))
+
+from xgboost import plot_importance
+from xgboost import plot_tree, to_graphviz
+
+importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_})
+importances = importances.sort_values(by = "Importance", ascending = False)
+importances = importances.set_index('Feature')
+print(importances)
+importances.plot.bar()
+
+fig, ax = plt.subplots(figsize=(17,12))
+plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax)
+plt.xlabel('Feature scores')
+plt.ylabel("Feature names")
+plt.title('Importance plot')
+plt.legend([''])
+#plt.show()
+plt.savefig(f"plot/{folder_save}/importance.jpg")
+
+# importance from the final trained booster, as in the reloaded scripts
+feature_importance = model_xgb.get_score(importance_type = 'weight')
+keys = list(feature_importance.keys())
+names_sig = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'jjVptratio', 'CvsL_max',
+             'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead',
+             'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_ll', 'del_eta_ll',
+             'del_phi_l2_subleading', 'del_phi_l2_leading']
+values = list(feature_importance.values())
+data = pd.DataFrame(data = values, index = names_sig, columns = ['score']).sort_values(by = 'score', ascending = False)
+print(data)
+print(data.index)
+
+fig = plt.figure(figsize=(17,12))
+ax1 = fig.add_subplot(1,2,1)
+ax1.set_axis_off()
+ax2 = fig.add_subplot(1,2,2)
+ax2.barh(list(reversed(data.index)), list(reversed(data.score)))
+ax2.set_xlabel('Feature scores')
+ax2.set_ylabel("Feature names")
+ax2.set_title('Importance plot')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/importance_train.jpg")
+
+plt.figure(figsize=(17,12))
+plot_tree(xgb_cl, fmap = 'feature_map.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/boost_tree.jpg", dpi = 1800)
+###result = 1/(1 + np.exp(-leaf_value)) gives the probability of class 1
+#plt.show()'''
+
+plt.figure(figsize=(17,12))
+plot_tree(model_xgb, fmap = 'feature_map.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/boost_tree_train.jpg", dpi = 1800)
+###result = 1/(1 + np.exp(-leaf_value)) gives the probability of class 1
+#plt.show()'''
+'''
+plt.figure(figsize=(17,12))
+to_graphviz(model_xgb, fmap = 'feature_map.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/boost_tree_train_graphviz.jpg", dpi = 1800)
+###result = 1/(1 + np.exp(-leaf_value)) gives the probability of class 1
+#plt.show()'''
diff --git a/xgb_test_only_xgb_no_coffea.py b/xgb_test_only_xgb_no_coffea.py
new file mode 100644
index 0000000..dd3b1d1
--- /dev/null
+++ b/xgb_test_only_xgb_no_coffea.py
@@ -0,0 +1,399 @@
+from coffea.util import load
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt, mplhep as hep
+import hist
+import argparse, sys, os, arrow, glob, yaml
+from matplotlib.offsetbox import AnchoredText
+import xgboost as xgb
+from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
+from sklearn.metrics import accuracy_score
+from tqdm.notebook import tqdm
+from sklearn.metrics import roc_auc_score, roc_curve
+from sklearn.model_selection import RepeatedKFold
+import json
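+# Directory-creation sketch: os.mkdir fails when intermediate directories are
+# missing, while os.makedirs with exist_ok=True covers both the existing and
+# the nested case (assumption: the nested "plot/<folder_save>" layout used
+# below is the intended one):
+#
+#   os.makedirs(f"./plot/{folder_save}", exist_ok = True)
+#   os.makedirs(net_path + f"plot/{folder_save}", exist_ok = True)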
+
+#######################################################################################
+## Create the folder to save the data if it doesn't exist and read in the dataframe ###
+#######################################################################################
+net_path = "/net/scratch_cms3a/vaulin/"
+folder_save = 'eval_23_04_11'
+roi = 'low_mumu'
+if not os.path.exists(f"./plot/{folder_save}"):
+    os.mkdir(f"./plot/{folder_save}")
+if not os.path.exists(net_path + f"plot/{folder_save}"):
+    os.mkdir(net_path + f"plot/{folder_save}")
+df = pd.read_csv(net_path + f'xgb_training_dataset_{roi}.csv')
+
+time = arrow.now().format("YY_MM_DD")
+plt.style.use(hep.style.ROOT)
+
+########################################################################################
+########## drop target from df and bring it to a separate column, drop weights #########
+########################################################################################
+X = df.drop("target", axis = 1)
+print(X)
+X = X.drop(f"wei_{roi}", axis = 1)
+X = X.drop(f"Z_mass_{roi}", axis = 1)
+X = X.drop(f"Z_pt_gen_{roi}", axis = 1)
+X = X.drop(f"Z_mass_gen_{roi}", axis = 1)
+print(X)
+print(X.info())
+
+y = df["target"]
+print(y)
+
+########################################################################################
+################# GRID search attempt ##################################################
+########################################################################################
+'''
+from sklearn.model_selection import GridSearchCV
+
+### Create the parameter grid
+gbm_param_grid = {'max_depth' : [3, 4, 5, 6, 7, 8, 9], 'min_child_weight' : [1], 'gamma' : [0], 'subsample' : [0.8], 'colsample_bytree' : [0.8], 'reg_alpha' : [0.005], 'n_estimators': [1000]}
+
+gbm = xgb.XGBRegressor()
+
+grid_mse = GridSearchCV(param_grid = gbm_param_grid, estimator = gbm, scoring = 'neg_mean_squared_error', cv = 4, verbose = 1)
+
+grid_mse.fit(X,y)
+
+print("Best parameters found: ", grid_mse.best_params_)
+print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_)))
+'''
+
+########################################################################################
+############# An attempt to do hyperparameter tuning for the classifier fit ############
+########################################################################################
+space = {"max_depth": hp.quniform("max_depth", 3, 18, 1),
+         "gamma": hp.uniform("gamma", 1, 9),
+         "reg_alpha": hp.quniform("reg_alpha", 40, 180, 1),
+         "reg_lambda": hp.uniform("reg_lambda", 0, 1),
+         "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1),
+         "min_child_weight": hp.quniform("min_child_weight", 0, 10, 1),
+         "n_estimators": 200,
+         "learning_rate": hp.uniform("learning_rate", 0.001, 0.1),
+         "subsample": hp.uniform("subsample", 0.8, 1),
+         "seed": 0}
+
+#learning_rate = space['learning_rate'],
+
+def objective(space):
+    # colsample_bytree is a fraction in (0.5, 1] and must stay a float;
+    # an int() cast would truncate it to 0
+    clf = xgb.XGBClassifier(n_estimators = int(space['n_estimators']), max_depth = int(space['max_depth']), gamma = space['gamma'], reg_alpha = int(space['reg_alpha']), min_child_weight = int(space['min_child_weight']), colsample_bytree = space['colsample_bytree'], eval_metric = 'auc', early_stopping_rounds = 10)
+    evaluation = [(X_train, y_train), (X_test, y_test)]
+
+    clf.fit(X_train, y_train, eval_set = evaluation, verbose = False)
+    pred = clf.predict(X_test)
+    accuracy = accuracy_score(y_test, pred > 0.5)
+    print("SCORE: ", accuracy)
+    return {'loss': 
-accuracy, 'status': STATUS_OK} + +######################################################################################### +############# Create pipelines for xgb training ######################################### +######################################################################################### +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder + +categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + +from sklearn.preprocessing import StandardScaler +numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + +cat_cols = X.select_dtypes(exclude = "number").columns +num_cols = X.select_dtypes(include = "number").columns + +print(cat_cols) +print(num_cols) + +from sklearn.compose import ColumnTransformer + +full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + + + +X_processed = full_processor.fit_transform(X) +y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + +######################################################################################### +############ split dataset into training and test ####################################### +######################################################################################### +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state = 1121218) +#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 101, stratify = y) +print(X_train) +print(X_test) +print(y_train) + +############################################################################################################ +######### preparing the XGB classifiers in 20 x 5-folds cross validation using repeated k-fold ############# +############################################################################################################ +cv = RepeatedKFold(n_splits = 5, n_repeats = 20, random_state = 101) +folds = [(train, test) for train, test in cv.split(X_train, y_train)] +#print(folds) +metrics = ['auc', 'fpr', 'tpr', 'thresholds'] +results = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +results_zero_train = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +results_weak_train = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +eta = 0.3 +params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': eta} +with open(net_path + f"plot/{folder_save}/results_first.json", 'w') as outfile: + json.dump(results, outfile) + + + +dtest = xgb.DMatrix(X_test, label = y_test) +#print(dtest) +for train, test in tqdm(folds, total = len(folds)): + print('train') + dtrain = xgb.DMatrix(X_train[train,:], + label = y_train[train]) + dval = xgb.DMatrix(X_train[test, :], label = y_train[test]) + model = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 10, num_boost_round = 200) #num_boost_round = 1000, 200 is optimal + model_zero_train = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 
'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 10, num_boost_round = 0) #num_boost_round = 1000, 200 is optimal + model_weak_train = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 10, num_boost_round = 20) #num_boost_round = 1000, 200 is optimal + sets = [dtrain, dval, dtest] + for i, ds in enumerate(results.keys()): + print(i) + y_preds = model.predict(sets[i]) + y_preds_zero_train = model_zero_train.predict(sets[i]) + y_preds_weak_train = model_weak_train.predict(sets[i]) + labels = sets[i].get_label() + fpr, tpr, thresholds = roc_curve(labels, y_preds) + fpr_zero, tpr_zero, thresholds_zero = roc_curve(labels, y_preds_zero_train) + fpr_weak, tpr_weak, thresholds_weak = roc_curve(labels, y_preds_weak_train) + results[ds]['fpr'].append(fpr) + results[ds]['tpr'].append(tpr) + results[ds]['thresholds'].append(thresholds) + results[ds]['auc'].append(roc_auc_score(labels, y_preds)) + results_zero_train[ds]['fpr'].append(fpr_zero) + results_zero_train[ds]['tpr'].append(tpr_zero) + results_zero_train[ds]['thresholds'].append(thresholds_zero) + results_zero_train[ds]['auc'].append(roc_auc_score(labels, y_preds_zero_train)) + results_weak_train[ds]['fpr'].append(fpr_weak) + results_weak_train[ds]['tpr'].append(tpr_weak) + results_weak_train[ds]['thresholds'].append(thresholds_weak) + results_weak_train[ds]['auc'].append(roc_auc_score(labels, y_preds_weak_train)) + +def convert(x): + if hasattr(x, "tolist"): + return x.tolist() + raise TypeError(x) + +with open(net_path + f"plot/{folder_save}/results_lr_{eta}.json", 'w') as outfile: + #json.dump(results, outfile, indent = 4) + str_j = json.dumps(results, indent = 4, sort_keys = True, default=convert) + outfile.write(str_j) + +with open(net_path + f"plot/{folder_save}/results_zero_train_lr_{eta}.json", 'w') as outfile: + #json.dump(results, outfile, indent = 4) + str_j = json.dumps(results_zero_train, indent = 4, sort_keys = True, default=convert) + outfile.write(str_j) + +with open(net_path + f"plot/{folder_save}/results_weak_train_lr_{eta}.json", 'w') as outfile: + #json.dump(results, outfile, indent = 4) + str_j = json.dumps(results_weak_train, indent = 4, sort_keys = True, default=convert) + outfile.write(str_j) + +########################################################################################################## +############## plotting the ROC curves with uncertainties ################################################ +########################################################################################################## +kind = 'val' + +c_fill = 'rgba(52, 152, 219, 0.2)' +c_line = 'rgba(52, 152, 219, 0.5)' +c_line_main = 'rgba(41, 128, 185, 1.0)' +c_grid = 'rgba(189, 195, 199, 0.5)' +c_annot = 'rgba(149, 165, 166, 0.5)' +c_highlight = 'rgba(192, 57, 43, 1.0)' + +fpr_mean = np.linspace(0, 1, 100) + +interp_tprs = [] +for i in range(100): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) +tpr_mean = np.mean(interp_tprs, axis = 0) +tpr_mean[-1] = 1.0 +tpr_std = 2*np.std(interp_tprs, axis = 0) +tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) +tpr_lower = tpr_mean - tpr_std +auc = np.mean(results[kind]['auc']) + +import plotly.graph_objects as go + +fig = go.Figure([go.Scatter(x = tpr_upper, y = fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = 
tpr_lower, y = fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'),
+                 go.Scatter(x = tpr_mean, y = fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.3f}')])
+
+fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 0, y1 = 1)
+# x carries the TPR band and y the FPR grid, so the axis labels follow the data
+fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = 'FPR (background efficiency)', width = 1600, height = 900, legend = dict(yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,))
+fig.update_yaxes(range = [0,1], gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black')
+fig.update_xaxes(range = [0,1], gridcolor = c_grid, constrain = 'domain', linecolor = 'black')
+
+fig.write_image(net_path + f"plot/{folder_save}/plotly_ROC_bg_eff.jpg")
+fig.write_image(net_path + f"plot/{folder_save}/plotly_ROC_bg_eff.pdf")
+
+'''
+fig = go.Figure([go.Scatter(x = 1 - fpr_mean, y = tpr_upper, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'),
+                 go.Scatter(x = 1 - fpr_mean, y = tpr_lower, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'),
+                 go.Scatter(x = 1 - fpr_mean, y = tpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.3f}')])
+
+fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 0, y1 = 1)
+fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = '1 - FPR (Background rejection)', yaxis_title = 'TPR (signal efficiency)', width = 800, height = 800, legend = dict(yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,))
+fig.update_yaxes(range = [0,1], gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black')
+fig.update_xaxes(range = [0,1], gridcolor = c_grid, constrain = 'domain', linecolor = 'black')
+
+fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_rej.jpg")
+fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_rej.pdf")
+'''
+##################################################################################################
+########## Actual hyperparameter tuning ##########################################################
+##################################################################################################
+
+trials = Trials()
+
+#best_hyperparams = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 100, trials = trials)
+#print("The best hyperparameters are: ", "\n")
+#print(best_hyperparams)
+
+from sklearn.metrics import accuracy_score
+
+### Init classifier
+#xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = best_hyperparams['learning_rate'], gamma = best_hyperparams['gamma'], reg_alpha = best_hyperparams['reg_alpha'], reg_lambda = best_hyperparams['reg_lambda'], n_estimators = 200, max_depth = int(best_hyperparams['max_depth']), subsample = best_hyperparams['subsample'], min_child_weight = best_hyperparams['min_child_weight'], colsample_bytree = best_hyperparams['colsample_bytree'])
+xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = 0.0292, gamma = 1.087, reg_alpha = 42.0, reg_lambda = 0.381, n_estimators = 200, max_depth = 8, subsample = 0.841, min_child_weight = 2.0, colsample_bytree = 0.994)
+
+### Fit
+dtest = xgb.DMatrix(X_test, label = y_test)
+#print(dtest)
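+# The 80/20 dtrain/dval split below slices sequentially; after the stratified,
+# shuffled train_test_split above, the row order carries no structure, so this
+# is effectively random. A sketch of an explicit alternative (names X_tr etc.
+# are illustrative, not part of this script):
+#
+#   from sklearn.model_selection import train_test_split
+#   X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train,
+#                                               test_size = 0.2,
+#                                               random_state = 101)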
+dtrain = xgb.DMatrix(X_train[:int(len(X_train)*0.8), :], label = y_train[:int(len(y_train)*0.8)])
+dval = xgb.DMatrix(X_train[int(len(X_train)*0.8):, :], label = y_train[int(len(y_train)*0.8):])
+model_xgb = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'), (dval, 'dval')],
+                      verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 200) #num_boost_round = 1000,
+sets = [dtrain, dval, dtest]
+results_new = {'train': {m: [] for m in metrics},
+               'val': {m: [] for m in metrics},
+               'test': {m: [] for m in metrics}}
+params_new = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss'}
+
+for i, ds in enumerate(results_new.keys()):
+    print(i)
+    y_preds_new = model_xgb.predict(sets[i])
+    labels_new = sets[i].get_label()
+    fpr_new, tpr_new, thresholds_new = roc_curve(labels_new, y_preds_new)
+    results_new[ds]['fpr'].append(fpr_new)
+    results_new[ds]['tpr'].append(tpr_new)
+    results_new[ds]['thresholds'].append(thresholds_new)
+    results_new[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new))
+
+xgb_cl.fit(X_train, y_train)
+
+print(xgb_cl)
+### Predict
+preds = xgb_cl.predict(X_test)
+
+print(accuracy_score(y_test, preds))
+
+print(y_test)
+print(model_xgb.predict(dtest))
+print(np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)]))
+predict_train = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)])
+
+print(accuracy_score(y_test, predict_train))
+
+from xgboost import plot_importance
+from xgboost import plot_tree, to_graphviz
+
+importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_})
+importances = importances.sort_values(by = "Importance", ascending = False)
+importances = importances.set_index('Feature')
+print(importances)
+importances.plot.bar()
+
+fig, ax = plt.subplots(figsize=(17,12))
+plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax)
+plt.xlabel('Feature scores')
+plt.ylabel("Feature names")
+plt.title('Importance plot')
+plt.legend([''])
+#plt.show()
+plt.savefig(net_path + f"plot/{folder_save}/importance.jpg")
+
+# importance from the final trained booster, as in the reloaded scripts
+feature_importance = model_xgb.get_score(importance_type = 'weight')
+keys = list(feature_importance.keys())
+names_sig = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'jjVptratio', 'CvsL_max',
+             'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead',
+             'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll',
+             'del_phi_l2_subleading', 'del_phi_l2_leading']
+values = list(feature_importance.values())
+data = pd.DataFrame(data = values, index = names_sig, columns = ['score']).sort_values(by = 'score', ascending = False)
+print(data)
+print(data.index)
+
+fig = plt.figure(figsize=(17,12))
+ax1 = fig.add_subplot(1,2,1)
+ax1.set_axis_off()
+ax2 = fig.add_subplot(1,2,2)
+ax2.barh(list(reversed(data.index)), list(reversed(data.score)))
+ax2.set_xlabel('Feature scores')
+ax2.set_ylabel("Feature names")
+ax2.set_title('Importance plot')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/importance_train.jpg")
+
+plt.figure(figsize=(17,12))
+plot_tree(xgb_cl, fmap = 'feature_map.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(net_path + f"plot/{folder_save}/boost_tree.jpg", dpi = 1800)
+###result = 1/(1 + np.exp(-leaf_value)) gives the probability of class 1
+#plt.show()'''
+
+plt.figure(figsize=(17,12))
+plot_tree(model_xgb, fmap = 'feature_map.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(net_path + f"plot/{folder_save}/boost_tree_train.jpg", dpi = 1800)
+###result = 1/(1 + np.exp(-leaf_value)) gives the probability of class 1
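+# get_score(importance_type = 'weight') counts how often a feature is used to
+# split; xgboost's Booster.get_score also accepts other standard views. A
+# short comparison sketch:
+#
+#   for itype in ('weight', 'gain', 'total_gain', 'cover'):
+#       print(itype, model_xgb.get_score(importance_type = itype))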
+#plt.show()'''
+'''
+plt.figure(figsize=(17,12))
+to_graphviz(model_xgb, fmap = 'feature_map.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/boost_tree_train_graphviz.jpg", dpi = 1800)
+###result = 1/(1 + np.exp(-leaf_value)) gives the probability of class 1
+#plt.show()'''
diff --git a/xgb_test_only_xgb_reloaded.py b/xgb_test_only_xgb_reloaded.py
new file mode 100644
index 0000000..9f3d72d
--- /dev/null
+++ b/xgb_test_only_xgb_reloaded.py
@@ -0,0 +1,294 @@
+from coffea.util import load
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt, mplhep as hep
+import hist
+import argparse, sys, os, arrow, glob, yaml
+from matplotlib.offsetbox import AnchoredText
+import xgboost as xgb
+from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
+from sklearn.metrics import accuracy_score
+from tqdm.notebook import tqdm
+from sklearn.metrics import roc_auc_score, roc_curve
+from sklearn.model_selection import RepeatedKFold
+import json
+
+folder_save = 'eval_23_03_07_1'
+if not os.path.exists(f"./plot/{folder_save}"):
+    os.mkdir(f"./plot/{folder_save}")
+df = pd.read_csv('xgb_training_dataset_low_ee.csv')
+
+learning_rate = 0.3
+
+time = arrow.now().format("YY_MM_DD")
+plt.style.use(hep.style.ROOT)
+
+X = df.drop("target", axis = 1)
+print(X)
+X = X.drop("wei_low_ee", axis = 1)
+print(X)
+print(X.info())
+
+y = df["target"]
+print(y)
+
+space = {"max_depth": hp.quniform("max_depth", 3, 18, 1),
+         "gamma": hp.uniform("gamma", 1, 9),
+         "reg_alpha": hp.quniform("reg_alpha", 40, 180, 1),
+         "reg_lambda": hp.uniform("reg_lambda", 0, 1),
+         "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1),
+         "min_child_weight": hp.quniform("min_child_weight", 0, 10, 1),
+         "n_estimators": 200,
+         "learning_rate": hp.uniform("learning_rate", 0.001, 0.1),
+         "subsample": hp.uniform("subsample", 0.8, 1),
+         "seed": 0}
+
+#learning_rate = space['learning_rate'],
+
+def objective(space):
+    # colsample_bytree is a fraction in (0.5, 1] and must stay a float;
+    # an int() cast would truncate it to 0
+    clf = xgb.XGBClassifier(n_estimators = int(space['n_estimators']), max_depth = int(space['max_depth']), gamma = space['gamma'], reg_alpha = int(space['reg_alpha']), min_child_weight = int(space['min_child_weight']), colsample_bytree = space['colsample_bytree'], eval_metric = 'auc', early_stopping_rounds = 10)
+    evaluation = [(X_train, y_train), (X_test, y_test)]
+
+    clf.fit(X_train, y_train, eval_set = evaluation, verbose = False)
+    pred = clf.predict(X_test)
+    accuracy = accuracy_score(y_test, pred > 0.5)
+    print("SCORE: ", accuracy)
+    return {'loss': -accuracy, 'status': STATUS_OK}
+
+from sklearn.impute import SimpleImputer
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import OneHotEncoder
+
+categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),])
+
+from sklearn.preprocessing import StandardScaler
+numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())])
+
+cat_cols = X.select_dtypes(exclude = "number").columns
+num_cols = X.select_dtypes(include = "number").columns
+
+print(cat_cols)
+print(num_cols)
+
+from sklearn.compose import ColumnTransformer
+
+full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),])
+
+X_processed = full_processor.fit_transform(X)
+y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1))
+
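+# train_test_split below defaults to a 75/25 split; stratify = y_processed
+# keeps the signal/background ratio identical in both parts. A quick
+# class-balance check sketch (np is imported above):
+#
+#   vals, counts = np.unique(y_processed, return_counts = True)
+#   print(dict(zip(vals.tolist(), counts.tolist())))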
+from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state = 1121218) +#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 101, stratify = y) +print(X_train) +print(X_test) +print(y_train) + +with open(f"plot/{folder_save}/results_lr_{learning_rate}.json") as user_file: + file_contents = user_file.read() + +results = json.loads(file_contents) +params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} +metrics = ['auc', 'fpr', 'tpr', 'thresholds'] +def convert(x): + if hasattr(x, "tolist"): + return x.tolist() + raise TypeError(x) + + +#kind = 'val' +kind = 'test' +#kind = 'train' + +c_fill = 'rgba(52, 152, 219, 0.2)' +c_line = 'rgba(52, 152, 219, 0.5)' +c_line_main = 'rgba(41, 128, 185, 1.0)' +c_grid = 'rgba(189, 195, 199, 0.5)' +c_annot = 'rgba(149, 165, 166, 0.5)' +c_highlight = 'rgba(192, 57, 43, 1.0)' + +fpr_mean = np.linspace(0, 1, 100) + +interp_tprs = [] +for i in range(100): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) +tpr_mean = np.mean(interp_tprs, axis = 0) +tpr_mean[-1] = 1.0 +tpr_std = 2*np.std(interp_tprs, axis = 0) +tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) +tpr_lower = tpr_mean - tpr_std +auc = np.mean(results[kind]['auc']) + +range_plot_x = [0,1] +range_plot_y = [0.2,1] + +import plotly.graph_objects as go + +fig = go.Figure([go.Scatter(x = tpr_upper, y = fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = tpr_mean, y = fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.3f}')]) + +fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) +fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = 'FPR (Background efficiency)', width = 1600, height = 900, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) +fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') +fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + +fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_eff_reloaded__lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}.jpg") +fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_eff_reloaded__lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}.pdf") + + +fig = go.Figure([go.Scatter(x = tpr_upper, y = 1 - fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = 1 - fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = tpr_mean, y = 1 - fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.3f}')]) + +fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) +fig.update_layout(template = 'plotly_white', title_x = 0.5, 
xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) +fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') +fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + +fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_rej_reloaded__lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}.jpg") +fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_rej_reloaded__lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}.pdf") + + + + +trials = Trials() + +#best_hyperparams = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 100, trials = trials) +#print("The best hyperparameters are: ", "\n") +#print(best_hyperparams) + + + + + + + + + + + + + + + + +from sklearn.metrics import accuracy_score + +### Init classifier +#xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = best_hyperparams['learning_rate'], gamma = best_hyperparams['gamma'], reg_alpha = best_hyperparams['reg_alpha'], reg_lambda = best_hyperparams['reg_lambda'], n_estimators = 200, max_depth = int(best_hyperparams['max_depth']), subsample = best_hyperparams['subsample'], min_child_weight = best_hyperparams['min_child_weight'], colsample_bytree = best_hyperparams['colsample_bytree']) +xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = 0.0292, gamma = 1.087, reg_alpha = 42.0, reg_lambda = 0.381, n_estimators = 200, max_depth = 8, subsample = 0.841, min_child_weight = 2.0, colsample_bytree = 0.994) + +### Fit +dtest = xgb.DMatrix(X_test, label = y_test) +#print(dtest) +dtrain = xgb.DMatrix(X_train[:int(len(X_train)*0.8), :], label = y_train[:int(len(y_train)*0.8)]) +dval = xgb.DMatrix(X_train[int(len(X_train)*0.8):, :], label = y_train[int(len(y_train)*0.8):]) +model_xgb = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 200) #num_boost_round = 1000, +sets = [dtrain, dval, dtest] +results_new = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +params_new = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss'} + +for i, ds in enumerate(results_new.keys()): + print(i) + y_preds_new = model_xgb.predict(sets[i]) + labels_new = sets[i].get_label() + fpr_new, tpr_new, thresholds_new = roc_curve(labels_new, y_preds_new) + results_new[ds]['fpr'].append(fpr_new) + results_new[ds]['tpr'].append(tpr_new) + results_new[ds]['thresholds'].append(thresholds_new) + results_new[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new)) + +xgb_cl.fit(X_train, y_train) + +print(xgb_cl) +### Predict +preds = xgb_cl.predict(X_test) + +print(accuracy_score(y_test, preds)) + +print(y_test) +print(model_xgb.predict(dtest)) +print(np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)])) +predict_train = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)]) + +print(accuracy_score(y_test, predict_train)) + +from xgboost import plot_importance +from xgboost import plot_tree, to_graphviz + +importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_}) +importances = importances.sort_values(by = "Importance", ascending = False) +importances = 
importances.set_index('Feature') +print(importances) +importances.plot.bar() + +fig, ax = plt.subplots(figsize=(17,12)) +plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax) +plt.xlabel('Feature scores') +plt.ylabel("Feature names") +plt.title('Importance plot') +plt.legend(['']) +#plt.show() +plt.savefig(f"plot/{folder_save}/importance.jpg") + + +feature_importance = model_xgb.get_score(importance_type = 'weight') +keys = list(feature_importance.keys()) +names_sig = ['m(H)', '$p_t$(H)', '$p_t$(Z)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$', + '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet', + '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(e_1, e_2)$', '$\Delta\eta(e_1, e_2)$', + '$\Delta\Phi (e_{subleading}, jet_{subleading})$', '$\Delta\Phi (e_{subleading}, jet_{leading})$'] +values = list(feature_importance.values()) +data = pd.DataFrame(data = values, index = names_sig, columns = ['score']).sort_values(by = 'score', ascending = False) +print(data) +print(data.index) + + +fig = plt.figure(figsize=(17,12)) +ax1 = fig.add_subplot(1,2,1) +ax1.set_axis_off() +ax2 = fig.add_subplot(1,2,2) +ax2.barh(list(reversed(data.index)), list(reversed(data.score))) +ax2.set_xlabel('Feature scores') +ax2.set_ylabel("Feature names") +ax2.set_title('Importance plot') +#plt.show() +plt.savefig(f"plot/{folder_save}/importance_train_lr_{learning_rate}.jpg") + + +plt.figure(figsize=(17,12)) +plot_tree(xgb_cl, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"plot/{folder_save}/boost_tree.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' + +plt.figure(figsize=(17,12)) +plot_tree(model_xgb, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"plot/{folder_save}/boost_tree_train_lr_{learning_rate}.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' +''' +plt.figure(figsize=(17,12)) +to_graphviz(model_xgb, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"plot/{folder_save}/boost_tree_train_graphviz.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' diff --git a/xgb_test_only_xgb_reloaded_no_coffea.py b/xgb_test_only_xgb_reloaded_no_coffea.py new file mode 100644 index 0000000..bc91384 --- /dev/null +++ b/xgb_test_only_xgb_reloaded_no_coffea.py @@ -0,0 +1,287 @@ +from coffea.util import load +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt, mplhep as hep +import hist +import argparse, sys, os, arrow, glob, yaml +from matplotlib.offsetbox import AnchoredText +import xgboost as xgb +from hyperopt import STATUS_OK, Trials, fmin, hp, tpe +from sklearn.metrics import accuracy_score +from tqdm.notebook import tqdm +from sklearn.metrics import roc_auc_score, roc_curve +from sklearn.model_selection import RepeatedKFold +import json + +net_path = "/net/scratch_cms3a/vaulin/" +folder_save = 'eval_23_05_02' +if not os.path.exists(f"./plot/{folder_save}"): + os.mkdir(f"./plot/{folder_save}") +if not os.path.exists(net_path + f"plot/{folder_save}"): + os.mkdir(net_path + f"plot/{folder_save}") +df = pd.read_csv(net_path + 'xgb_training_dataset_low_mumu.csv') + +roi = 'low_mumu' +learning_rate = 0.3 + +time = arrow.now().format("YY_MM_DD") +plt.style.use(hep.style.ROOT) + +X = df.drop("target", axis = 1) +print(X) +X = X.drop(f"wei_{roi}", axis = 
1) +X = X.drop(f"Z_mass_{roi}", axis = 1) +X = X.drop(f"Z_pt_gen_{roi}", axis = 1) +X = X.drop(f"Z_mass_gen_{roi}", axis = 1) +print(X) +print(X.info()) + +y = df["target"] +print(y) + + +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder + +categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + +from sklearn.preprocessing import StandardScaler +numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + +cat_cols = X.select_dtypes(exclude = "number").columns +num_cols = X.select_dtypes(include = "number").columns + +print(cat_cols) +print(num_cols) + +from sklearn.compose import ColumnTransformer + +full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + + + +X_processed = full_processor.fit_transform(X) +y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state = 1121218) +#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 101, stratify = y) +print(X_train) +print(X_test) +print(y_train) + + +############################################################################################################################################### +################### Getting ROC curves from json files ######################################################################################## +############################################################################################################################################### +def convert(x): + if hasattr(x, "tolist"): + return x.tolist() + raise TypeError(x) + + +kind = 'val' +#kind = 'test' +#kind = 'train' + +def pretty_ROC_Curve(tr_set, kind, type): + + with open(tr_set) as user_file: + file_contents = user_file.read() + + results = json.loads(file_contents) + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + + c_fill = 'rgba(52, 152, 219, 0.2)' + c_line = 'rgba(52, 152, 219, 0.5)' + c_line_main = 'rgba(41, 128, 185, 1.0)' + c_grid = 'rgba(189, 195, 199, 0.5)' + c_annot = 'rgba(149, 165, 166, 0.5)' + c_highlight = 'rgba(192, 57, 43, 1.0)' + + fpr_mean = np.linspace(0, 1, 100) + + interp_tprs = [] + for i in range(100): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) + tpr_mean = np.mean(interp_tprs, axis = 0) + tpr_mean[-1] = 1.0 + tpr_std = 2*np.std(interp_tprs, axis = 0) + tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) + tpr_lower = tpr_mean - tpr_std + auc = np.mean(results[kind]['auc']) + + range_plot_x = [0,1] + range_plot_y = [0.2,1] + + import plotly.graph_objects as go + + + fig = go.Figure([go.Scatter(x = tpr_upper, y = 1 - fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = 1 - fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = tpr_mean, y = 1 - fpr_mean, 
line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.5f}')]) + + fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) + fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) + fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') + fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + + fig.write_image(net_path + f"plot/{folder_save}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}.jpg") + fig.write_image(net_path + f"plot/{folder_save}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}.pdf") + + +pretty_ROC_Curve(net_path + f"plot/{folder_save}/results_lr_{learning_rate}.json", kind, "full") +############################################################################################################################################################## +##################### Zero train ROC ######################################################################################################################### +############################################################################################################################################################## + +pretty_ROC_Curve(net_path + f"plot/{folder_save}/results_zero_train_lr_{learning_rate}.json", kind, 'zero') + +############################################################################################################################################################## +##################### Weak train ROC ######################################################################################################################### +############################################################################################################################################################## + +pretty_ROC_Curve(net_path + f"plot/{folder_save}/results_weak_train_lr_{learning_rate}.json", kind, 'weak') + +############################################################################################################################################################## + + +trials = Trials() + +############################################################################################################################################################## +##################### Initiate the final training to be presented with the best parameters ################################################################### +############################################################################################################################################################## + +from sklearn.metrics import accuracy_score + +### Init classifier +xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = 0.0292, gamma = 1.087, reg_alpha = 42.0, reg_lambda = 0.381, n_estimators = 200, max_depth = 8, subsample = 0.841, min_child_weight = 2.0, colsample_bytree = 0.994) + +### Fit +params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} +metrics = ['auc', 'fpr', 'tpr', 'thresholds'] +dtest = xgb.DMatrix(X_test, label = y_test) +#print(dtest) +dtrain = xgb.DMatrix(X_train[:int(len(X_train)*0.8), :], label = 
y_train[:int(len(y_train)*0.8)]) +dval = xgb.DMatrix(X_train[int(len(X_train)*0.8):, :], label = y_train[int(len(y_train)*0.8):]) +model_xgb = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 200) #num_boost_round = 1000, +sets = [dtrain, dval, dtest] +results_new = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +params_new = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss'} + +for i, ds in enumerate(results_new.keys()): + print(i) + y_preds_new = model_xgb.predict(sets[i]) + labels_new = sets[i].get_label() + fpr_new, tpr_new, thresholds_new = roc_curve(labels_new, y_preds_new) + results_new[ds]['fpr'].append(fpr_new) + results_new[ds]['tpr'].append(tpr_new) + results_new[ds]['thresholds'].append(thresholds_new) + results_new[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new)) + +xgb_cl.fit(X_train, y_train) + +print(xgb_cl) + +################################################################################################################################### +################################## Predict and give the final accuracy scores and importance plots ################################ +################################################################################################################################### +preds = xgb_cl.predict(X_test) + +print(accuracy_score(y_test, preds)) + +print(y_test) +print(model_xgb.predict(dtest)) +print(np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)])) +predict_train = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)]) + +print(accuracy_score(y_test, predict_train)) + +from xgboost import plot_importance +from xgboost import plot_tree, to_graphviz + +importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_}) +importances = importances.sort_values(by = "Importance", ascending = False) +importances = importances.set_index('Feature') +print(importances) +importances.plot.bar() + +fig, ax = plt.subplots(figsize=(17,12)) +plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax) +plt.xlabel('Feature scores') +plt.ylabel("Feature names") +plt.title('Importance plot') +plt.legend(['']) +#plt.show() +plt.savefig(net_path + f"plot/{folder_save}/importance.jpg") + + +feature_importance = model_xgb.get_score(importance_type = 'weight') +keys = list(feature_importance.keys()) +names_sig = ['m(H)', '$p_t$(H)', '$p_t$(Z)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$', + '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet', + '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$', + '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$'] +values = list(feature_importance.values()) +data = pd.DataFrame(data = values, index = names_sig, columns = ['score']).sort_values(by = 'score', ascending = False) +print(data) +print(data.index) + + +fig = plt.figure(figsize=(17,12)) +ax1 = fig.add_subplot(1,2,1) +ax1.set_axis_off() +ax2 = fig.add_subplot(1,2,2) +ax2.barh(list(reversed(data.index)), list(reversed(data.score))) +ax2.set_xlabel('Feature scores') +ax2.set_ylabel("Feature names") +ax2.set_title('Importance plot') +#plt.show() +plt.savefig(net_path + 
f"plot/{folder_save}/importance_train_lr_{learning_rate}.jpg") + + +plt.figure(figsize=(17,12)) +plot_tree(xgb_cl, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(net_path + f"plot/{folder_save}/boost_tree.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' + +plt.figure(figsize=(17,12)) +plot_tree(model_xgb, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(net_path + f"plot/{folder_save}/boost_tree_train_lr_{learning_rate}.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' + +plt.figure(figsize=(17,12)) +plt.hist(np.array(model_xgb.predict(dtest)), bins = 40, edgecolor = 'blue',fill = False) +plt.hist(np.array(predict_train), bins = 40, edgecolor = 'green', hatch = '/', fill = False) +plt.hist(y_test, bins = 40, facecolor = 'orange', edgecolor = 'orange', fill = False) +plt.title('Classifier output') +plt.legend(['Train output', 'Train output after threshold','Test data']) +#plt.show() +plt.savefig(net_path + f"plot/{folder_save}/class_output_train_lr_{learning_rate}.jpg") +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' + +''' +plt.figure(figsize=(17,12)) +to_graphviz(model_xgb, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"plot/{folder_save}/boost_tree_train_graphviz.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' diff --git a/xgb_test_only_xgb_reloaded_no_coffea_var.py b/xgb_test_only_xgb_reloaded_no_coffea_var.py new file mode 100644 index 0000000..4451ef9 --- /dev/null +++ b/xgb_test_only_xgb_reloaded_no_coffea_var.py @@ -0,0 +1,404 @@ +from coffea.util import load +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt, mplhep as hep +import hist +import argparse, sys, os, arrow, glob, yaml +from matplotlib.offsetbox import AnchoredText +import xgboost as xgb +from hyperopt import STATUS_OK, Trials, fmin, hp, tpe +from sklearn.metrics import accuracy_score +from tqdm.notebook import tqdm +from sklearn.metrics import roc_auc_score, roc_curve +from sklearn.model_selection import RepeatedKFold +import json + +net_path = "/net/scratch_cms3a/vaulin/" +folder_save = 'eval_23_05_02' +if not os.path.exists(f"./plot/{folder_save}"): + os.mkdir(f"./plot/{folder_save}") +if not os.path.exists(net_path + f"plot/{folder_save}"): + os.mkdir(net_path + f"plot/{folder_save}") +df = pd.read_csv(net_path + 'xgb_training_dataset_low_mumu.csv') + +roi = 'low_mumu' +learning_rate = 0.3 + +names_sig = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +var = f'Higgs_mass_{roi}' + +time = arrow.now().format("YY_MM_DD") +plt.style.use(hep.style.ROOT) + +X = df[var] +print(X) +print(X.info()) + +X_signal = df[var][df.target == 1] +X_bg = df[var][df.target == 0] + +y = df["target"] +print(y) + +y_signal = df["target"][df.target == 1] +y_bg = df["target"][df.target == 0] + +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder + +categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + +from sklearn.preprocessing 
import StandardScaler +numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + +cat_cols = pd.Index([], dtype = 'object') +num_cols = pd.Index([var], dtype = 'object') + +print(cat_cols) +print(num_cols) + +from sklearn.compose import ColumnTransformer + +full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + + + +#X_processed = full_processor.fit_transform(X) +y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) +y_processed_sig = SimpleImputer(strategy = "most_frequent").fit_transform(y_signal.values.reshape(-1,1)) +y_processed_bg = SimpleImputer(strategy = "most_frequent").fit_transform(y_bg.values.reshape(-1,1)) + +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X, y_processed, stratify = y_processed, random_state = 1121218) +X_train_sig, X_test_sig, y_train_sig, y_test_sig = train_test_split(X_signal, y_processed_sig, stratify = y_processed_sig, random_state = 1121218) +X_train_bg, X_test_bg, y_train_bg, y_test_bg = train_test_split(X_bg, y_processed_bg, stratify = y_processed_bg, random_state = 1121218) +#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 101, stratify = y) +print(X_train) +print(X_test) +print(y_train) + + +############################################################################################################################################### +################### Getting ROC curves from json files ######################################################################################## +############################################################################################################################################### +def convert(x): + if hasattr(x, "tolist"): + return x.tolist() + raise TypeError(x) + + +kind = 'val' +#kind = 'test' +#kind = 'train' + +def pretty_ROC_Curve(tr_set, kind, type, var): + + with open(tr_set) as user_file: + file_contents = user_file.read() + + results = json.loads(file_contents) + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + + c_fill = 'rgba(52, 152, 219, 0.2)' + c_line = 'rgba(52, 152, 219, 0.5)' + c_line_main = 'rgba(41, 128, 185, 1.0)' + c_grid = 'rgba(189, 195, 199, 0.5)' + c_annot = 'rgba(149, 165, 166, 0.5)' + c_highlight = 'rgba(192, 57, 43, 1.0)' + + fpr_mean = np.linspace(0, 1, 100) + + interp_tprs = [] + for i in range(100): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) + tpr_mean = np.mean(interp_tprs, axis = 0) + tpr_mean[-1] = 1.0 + tpr_std = 2*np.std(interp_tprs, axis = 0) + tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) + tpr_lower = tpr_mean - tpr_std + auc = np.mean(results[kind]['auc']) + + range_plot_x = [0,1] + range_plot_y = [0.2,1] + + import plotly.graph_objects as go + + + fig = go.Figure([go.Scatter(x = tpr_upper, y = 1 - fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = 1 - fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = tpr_mean, y = 1 - fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo 
= 'skip', showlegend = True, name = f'AUC: {auc:.5f}')]) + + fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) + fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) + fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') + fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + + fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}.jpg") + fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}.pdf") + + +pretty_ROC_Curve(f"plot/{folder_save}/results_lr_{learning_rate}.json", kind, "full", var) +############################################################################################################################################################## +##################### Zero train ROC ######################################################################################################################### +############################################################################################################################################################## + +pretty_ROC_Curve(f"plot/{folder_save}/results_zero_train_lr_{learning_rate}.json", kind, 'zero', var) + +############################################################################################################################################################## +##################### Weak train ROC ######################################################################################################################### +############################################################################################################################################################## + +pretty_ROC_Curve(f"plot/{folder_save}/results_weak_train_lr_{learning_rate}.json", kind, 'weak', var) + +############################################################################################################################################################## + + +trials = Trials() + +############################################################################################################################################################## +##################### Initiate the final training to be presented with the best parameters ################################################################### +############################################################################################################################################################## + +from sklearn.metrics import accuracy_score + +### Init classifier +xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = 0.0292, gamma = 1.087, reg_alpha = 42.0, reg_lambda = 0.381, n_estimators = 200, max_depth = 8, subsample = 0.841, min_child_weight = 2.0, colsample_bytree = 0.994, scale_pos_weight = 10) + +### Fit +params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} +metrics = ['auc', 'fpr', 'tpr', 'thresholds'] +dtest = xgb.DMatrix(X_test, label = y_test) +dtest_signal = xgb.DMatrix(X_test_sig, label = y_test_sig) +dtest_bg = xgb.DMatrix(X_test_bg, label = y_test_bg) +#print(dtest) +dtrain = 
xgb.DMatrix(X_train[:int(len(X_train)*0.8)], label = y_train[:int(len(y_train)*0.8)]) +dval = xgb.DMatrix(X_train[int(len(X_train)*0.8):], label = y_train[int(len(y_train)*0.8):]) +model_xgb = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 200) #num_boost_round = 1000, +model_xgb_weak = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 20) #num_boost_round = 1000, +model_xgb_zero = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 2) #num_boost_round = 1000, +sets = [dtrain, dval, dtest] +results_new = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +results_new_weak = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +results_new_zero = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +params_new = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss'} + +for i, ds in enumerate(results_new.keys()): + print(i) + y_preds_new = model_xgb.predict(sets[i]) + y_preds_new_weak = model_xgb_weak.predict(sets[i]) + y_preds_new_zero = model_xgb_zero.predict(sets[i]) + labels_new = sets[i].get_label() + fpr_new, tpr_new, thresholds_new = roc_curve(labels_new, y_preds_new) + fpr_new_weak, tpr_new_weak, thresholds_new_weak = roc_curve(labels_new, y_preds_new_weak) + fpr_new_zero, tpr_new_zero, thresholds_new_zero = roc_curve(labels_new, y_preds_new_zero) + results_new[ds]['fpr'].append(fpr_new) + results_new[ds]['tpr'].append(tpr_new) + results_new[ds]['thresholds'].append(thresholds_new) + results_new[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new)) + results_new_weak[ds]['fpr'].append(fpr_new_weak) + results_new_weak[ds]['tpr'].append(tpr_new_weak) + results_new_weak[ds]['thresholds'].append(thresholds_new_weak) + results_new_weak[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new_weak)) + results_new_zero[ds]['fpr'].append(fpr_new_zero) + results_new_zero[ds]['tpr'].append(tpr_new_zero) + results_new_zero[ds]['thresholds'].append(thresholds_new_zero) + results_new_zero[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new_zero)) + +def pretty_ROC_Curve_var(results, kind, type, var): + + results = results + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + + c_fill = 'rgba(52, 152, 219, 0.2)' + c_line = 'rgba(52, 152, 219, 0.5)' + c_line_main = 'rgba(41, 128, 185, 1.0)' + c_grid = 'rgba(189, 195, 199, 0.5)' + c_annot = 'rgba(149, 165, 166, 0.5)' + c_highlight = 'rgba(192, 57, 43, 1.0)' + + fpr_mean = np.linspace(0, 1, 100) + + interp_tprs = [] + for i in range(1): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) + tpr_mean = np.mean(interp_tprs, axis = 0) + tpr_mean[-1] = 1.0 + tpr_std = 2*np.std(interp_tprs, axis = 0) + tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) + tpr_lower = tpr_mean - tpr_std + auc = np.mean(results[kind]['auc']) + + range_plot_x = [0,1] + range_plot_y = [0.2,1] + + import plotly.graph_objects as go + + + fig = go.Figure([go.Scatter(x = tpr_upper, y = 1 - fpr_mean, line = dict(color 
= c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = 1 - fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = tpr_mean, y = 1 - fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.5f}')]) + + fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) + fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) + fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') + fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + + fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}_new.jpg") + fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}_new.pdf") + +pretty_ROC_Curve_var(results_new, 'test', 'full', var) + +xgb_cl.fit(X_train, y_train) + +print(xgb_cl) + +################################################################################################################################### +################################## Predict and give the final accuracy scores and importance plots ################################ +################################################################################################################################### +preds = xgb_cl.predict(X_test) + +print(accuracy_score(y_test, preds)) + +print(y_test) +print(model_xgb.predict(dtest)) +print(np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)])) +predict_train = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)]) +predict_train_weak = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb_weak.predict(dtest)]) +predict_train_zero = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb_zero.predict(dtest)]) + +print(accuracy_score(y_test, predict_train)) + +from xgboost import plot_importance +from xgboost import plot_tree, to_graphviz + +#importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_}) +#importances = importances.sort_values(by = "Importance", ascending = False) +#importances = importances.set_index('Feature') +#print(importances) +#importances.plot.bar() + +fig, ax = plt.subplots(figsize=(17,12)) +plot_importance(xgb_cl, fmap = 'feature_map_var.txt', ax = ax) +plt.xlabel('Feature scores') +plt.ylabel("Feature names") +plt.title('Importance plot') +plt.legend(['']) +#plt.show() +plt.savefig(f"plot/{folder_save}/importance_{var}.jpg") + + +feature_importance = model_xgb.get_score(importance_type = 'weight') +keys = list(feature_importance.keys()) +'''names_sig = ['m(H)', '$p_t$(H)', '$p_t$(Z)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$', + '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet', + '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$', + 
'$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$']'''
+names_sig = ['m(H)']
+values = list(feature_importance.values())
+data = pd.DataFrame(data = values, index = names_sig, columns = ['score']).sort_values(by = 'score', ascending = False)
+print(data)
+print(data.index)
+
+
+fig = plt.figure(figsize=(17,12))
+ax1 = fig.add_subplot(1,2,1)
+ax1.set_axis_off()
+ax2 = fig.add_subplot(1,2,2)
+ax2.barh(list(reversed(data.index)), list(reversed(data.score)))
+ax2.set_xlabel('Feature scores')
+ax2.set_ylabel("Feature names")
+ax2.set_title('Importance plot')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/importance_train_lr_{learning_rate}_{var}.jpg")
+
+
+plt.figure(figsize=(17,12))
+plot_tree(xgb_cl, fmap = 'feature_map_var.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/boost_tree_{var}.jpg", dpi = 1800)
+### result = 1/(1+np.exp(-leaf_value)) is the probability of belonging to class 1
+#plt.show()
+
+plt.figure(figsize=(17,12))
+plot_tree(model_xgb, fmap = 'feature_map_var.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/boost_tree_train_lr_{learning_rate}_{var}.jpg", dpi = 1800)
+### result = 1/(1+np.exp(-leaf_value)) is the probability of belonging to class 1
+#plt.show()
+
+# Compare the raw classifier scores, the thresholded (0/1) predictions and the true labels on the test set
+plt.figure(figsize=(17,12))
+plt.hist(np.array(model_xgb.predict(dtest)), bins = 40, edgecolor = 'blue', fill = False)
+plt.hist(np.array(predict_train), bins = 40, edgecolor = 'green', hatch = '/', fill = False)
+plt.hist(y_test, bins = 40, facecolor = 'orange', edgecolor = 'orange', fill = False)
+plt.title('Classifier output')
+plt.legend(['Model output on test set', 'Output after 0.5 threshold', 'True test labels'])
+#plt.show()
+plt.savefig(f"plot/{folder_save}/class_output_train_lr_{learning_rate}_{var}.jpg")
+
+plt.figure(figsize=(17,12))
+plt.hist(np.array(model_xgb.predict(dtest_signal)), bins = 40, edgecolor = 'blue', fill = False)
+plt.hist(np.array(model_xgb.predict(dtest_bg)), bins = 40, edgecolor = 'red', fill = False)
+plt.title('Classifier output')
+plt.legend(['Signal', 'Background'])
+#plt.show()
+plt.savefig(f"plot/{folder_save}/class_output_train_lr_{learning_rate}_{var}_sig_vs_bg.jpg")
+
+plt.figure(figsize=(17,12))
+plt.hist(np.array(model_xgb_weak.predict(dtest)), bins = 40, edgecolor = 'blue', fill = False)
+plt.hist(np.array(predict_train_weak), bins = 40, edgecolor = 'green', hatch = '/', fill = False)
+plt.hist(y_test, bins = 40, facecolor = 'orange', edgecolor = 'orange', fill = False)
+plt.title('Classifier output')
+plt.legend(['Model output on test set', 'Output after 0.5 threshold', 'True test labels'])
+#plt.show()
+plt.savefig(f"plot/{folder_save}/class_output_train_lr_{learning_rate}_{var}_weak.jpg")
+
+plt.figure(figsize=(17,12))
+plt.hist(np.array(model_xgb_zero.predict(dtest)), bins = 40, edgecolor = 'blue', fill = False)
+plt.hist(np.array(predict_train_zero), bins = 40, edgecolor = 'green', hatch = '/', fill = False)
+plt.hist(y_test, bins = 40, facecolor = 'orange', edgecolor = 'orange', fill = False)
+plt.title('Classifier output')
+plt.legend(['Model output on test set', 'Output after 0.5 threshold', 'True test labels'])
+#plt.show()
+plt.savefig(f"plot/{folder_save}/class_output_train_lr_{learning_rate}_{var}_zero.jpg")
+
+### result = 1/(1+np.exp(-leaf_value)) is the probability of belonging to class 1
+#plt.show()
+
+# the quantity written here is the test-set accuracy, not a ROC AUC
+with open(f"plot/{folder_save}/ROC.txt", "a") as myfile:
+    myfile.write(f"Accuracy score for {var}: " + str(accuracy_score(y_test, predict_train)) + " " + '\n')
+
+'''
+plt.figure(figsize=(17,12))
+to_graphviz(model_xgb, fmap = 'feature_map.txt')
+plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"plot/{folder_save}/boost_tree_train_graphviz.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' From f3cfb344bae12e22984f8fd802a5408138f58ad1 Mon Sep 17 00:00:00 2001 From: ValVau <109755950+ValVau@users.noreply.github.com> Date: Wed, 31 May 2023 10:46:52 +0200 Subject: [PATCH 2/3] Add files via upload --- ...s_newHist_pandas_small_update_isolation.py | 241 +++++++++++++++++- 1 file changed, 233 insertions(+), 8 deletions(-) diff --git a/Zll_process_newHist_pandas_small_update_isolation.py b/Zll_process_newHist_pandas_small_update_isolation.py index 11de771..d7e32d0 100644 --- a/Zll_process_newHist_pandas_small_update_isolation.py +++ b/Zll_process_newHist_pandas_small_update_isolation.py @@ -585,9 +585,9 @@ def process(self, events): nEvents = len(events) print('Number of events: ', nEvents) if 'ZH' in dataset: - ttyp = 'signal_04_mid' + ttyp = 'signal_05_late' else: - ttyp = 'back_04_mid' + ttyp = 'back_05_late' folder_save = f'condor_{ttyp}' if not os.path.exists(f"./{folder_save}"): os.mkdir(f"./{folder_save}") @@ -595,8 +595,7 @@ def process(self, events): os.mkdir(f"./{folder_save}/{dataset}") if not os.path.exists(f"./{folder_save}/{dataset}/{filename}"): os.mkdir(f"./{folder_save}/{dataset}/{filename}") - with open(f"./{folder_save}/event_nr.txt", "a") as myfile: - myfile.write(f"Nr of events in {filename} from {start} to {stop}: " + str(nEvents) + " " + '\n') + # As far as I understand, this looks like a neat way to give selections a name, # while internally, there are boolean arrays for all events @@ -786,7 +785,14 @@ def process(self, events): - + names_events = [] + values_events = [] + names_events.append("Filename") + names_events.append("Start") + names_events.append("Stop") + values_events.append(f"{filename}") + values_events.append(f"{start}") + values_events.append(f"{stop}") # ================================================================================= # @@ -802,21 +808,102 @@ def process(self, events): ## muon twiki: https://twiki.cern.ch/twiki/bin/view/CMS/SWGuideMuonIdRun2 #event_mu = events.Muon[ak.argsort(events.Muon.pt, axis=1, ascending=False)] event_mu = events.Muon + nEvent_mu = len(event_mu) + ################################################################################### + nEvent_mu = ak.sum(event_mu.looseId, axis = 1) + nEvent_mu = ak.sum((nEvent_mu == 2)) + names_events.append("Number of 2 mu events") + values_events.append(nEvent_mu) + ################################################################################### # looseId >= 1 or looseId seems to be the same... 
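        # (In NanoAOD the Muon_looseId branch is stored as a boolean flag, so
        # `looseId` and `looseId >= 1` select exactly the same muons, which is why
        # the two forms behave identically here.)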
musel = ((event_mu.pt > 20) & (abs(event_mu.eta) < 2.4) & (event_mu.looseId >= 1) & (event_mu.pfRelIso04_all<0.25)) #(event_mu.looseId >= 1) (event_mu.mvaId >= 3) # but 25GeV and 0.06 for 1L, xy 0.05 z 0.2, &(abs(event_mu.dxy)<0.06)&(abs(event_mu.dz)<0.2) and tightId for 1L + + ################################################################################### + ############### Cutflow every single cut ########################################## + ################################################################################### + n_event_mu_pt = ak.sum((event_mu.pt > 20), axis = 1) ###, axis = 1 + n_event_mu_pt = ak.sum((n_event_mu_pt == 2)) + names_events.append("Number of mu events pt cut") + values_events.append(n_event_mu_pt) + ################################################################################### + n_event_mu_eta = ak.sum((abs(event_mu.eta) < 2.4), axis = 1) + n_event_mu_eta = ak.sum((n_event_mu_eta == 2)) + names_events.append("Number of mu events eta cut") + values_events.append(n_event_mu_eta) + ################################################################################### + n_event_mu_looseId = ak.sum((event_mu.looseId >= 1), axis = 1) + n_event_mu_looseId = ak.sum((n_event_mu_looseId == 2)) + names_events.append("Number of mu events looseId cut") + values_events.append(n_event_mu_looseId) + ################################################################################### + n_event_mu_iso = ak.sum((event_mu.pfRelIso04_all<0.25), axis = 1) + n_event_mu_iso = ak.sum((n_event_mu_iso == 2)) + names_events.append("Number of mu events iso cut") + values_events.append(n_event_mu_iso) + ################################################################################### + + ################################################################################### + ############### Cutflow cuts applied gradually #################################### + ################################################################################### + musel_pt_eta = ((event_mu.pt > 20) & (abs(event_mu.eta) < 2.4)) + nmu_pt_eta = ak.sum(musel_pt_eta,axis=1) + names_events.append("Mu selection pt eta") + values_events.append(ak.sum(nmu_pt_eta == 2)) + ################################################################################### + musel_plus_looseid = ((event_mu.pt > 20) & (abs(event_mu.eta) < 2.4) & (event_mu.looseId >= 1)) + nmu_plus_looseid = ak.sum(musel_plus_looseid,axis=1) + names_events.append("Mu selection pt eta looseId") + values_events.append(ak.sum(nmu_plus_looseid == 2)) + ################################################################################### + event_mu = event_mu[musel] + n_event_mu_sel = len(event_mu) event_mu = event_mu[ak.argsort(event_mu.pt, axis=1, ascending=False)] event_mu["lep_flav"] = 13*event_mu.charge event_mu= ak.pad_none(event_mu,2,axis=1) nmu = ak.sum(musel,axis=1) + names_events.append("Final mu selection") + values_events.append(ak.sum(nmu == 2)) # ToDo: PtCorrGeoFit # ## Electron cuts ## # electron twiki: https://twiki.cern.ch/twiki/bin/viewauth/CMS/CutBasedElectronIdentificationRun2 #event_e = events.Electron[ak.argsort(events.Electron.pt, axis=1,ascending=False)] event_e = events.Electron + nEvent_ele = len(event_e) + ################################################################################### + nEvent_ele = ak.sum((abs(event_e.pt)>=0), axis = 1) + nEvent_ele = ak.sum((nEvent_ele == 2)) + names_events.append("Number of 2 ele events") + values_events.append(nEvent_ele) + 
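+    ###################################################################################
+    # Cutflow idiom used above for the muons and below for electrons and jets.
+    # A minimal sketch (the names `mask`, `n_pass` and `n_events` are illustrative
+    # only and are not variables of this file):
+    #
+    #   mask     = (event_mu.pt > 20) & (abs(event_mu.eta) < 2.4)  # per-object booleans
+    #   n_pass   = ak.sum(mask, axis=1)   # reduce over the object axis: one count per event
+    #   n_events = ak.sum(n_pass == 2)    # events with exactly two passing objects
+    #
+    # Each cutflow entry appends one such event count to values_events. Leptons are
+    # counted with "== 2"; the jet cutflow below uses ">= 2" since at least two jets
+    # are required.
+    ###################################################################################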
################################################################################### elesel = ((event_e.pt > 20) & (abs(event_e.eta) < 2.5) & (event_e.mvaFall17V2Iso_WP90==1) & (event_e.pfRelIso03_all<0.25)) + ################################################################################### + + ################################################################################### + ############### Cutflow every single cut ########################################## + ################################################################################### + n_event_ele_pt = ak.sum((event_e.pt > 20), axis = 1) + n_event_ele_pt = ak.sum((n_event_ele_pt == 2)) + names_events.append("Number of ele events pt cut") + values_events.append(n_event_ele_pt) + ################################################################################### + n_event_ele_eta = ak.sum((abs(event_e.eta) < 2.5), axis = 1) + n_event_ele_eta = ak.sum((n_event_ele_eta == 2)) + names_events.append("Number of ele events eta cut") + values_events.append(n_event_ele_eta) + ################################################################################### + n_event_ele_mvaIso = ak.sum((event_e.mvaFall17V2Iso_WP90==1), axis = 1) + n_event_ele_mvaIso = ak.sum((n_event_ele_mvaIso == 2)) + names_events.append("Number of ele events mva Iso cut") + values_events.append(n_event_ele_mvaIso) + ################################################################################### + n_event_ele_pfrelIso = ak.sum((event_e.pfRelIso03_all<0.25), axis = 1) + n_event_ele_pfrelIso = ak.sum((n_event_ele_pfrelIso == 2)) + names_events.append("Number of ele events pf Rel Iso cut") + values_events.append(n_event_ele_pfrelIso) + ################################################################################### # but 30GeV and WP80 for 1L event_e = event_e[elesel] # something I saw in a recent presentation, and also in AT code: @@ -828,7 +915,26 @@ def process(self, events): event_e = event_e[ak.argsort(event_e.pt, axis=1,ascending=False)] event_e["lep_flav"] = 11*event_e.charge event_e = ak.pad_none(event_e,2,axis=1) + + ################################################################################### + ############### Cutflow cuts applied gradually #################################### + ################################################################################### + ele_pt_eta = ((event_e.pt > 20) & (abs(event_e.eta) < 2.5)) + nele_pt_eta = ak.sum(ele_pt_eta,axis=1) + names_events.append("Ele selection pt eta") + values_events.append(ak.sum(nele_pt_eta == 2)) + ################################################################################### + esel_plus_mvaId = ((event_e.pt > 20) & (abs(event_e.eta) < 2.5) & (event_e.mvaFall17V2Iso_WP90==1)) + nele_plus_mvaId = ak.sum(esel_plus_mvaId,axis=1) + names_events.append("Ele selection pt eta mvaId") + values_events.append(ak.sum(nele_plus_mvaId == 2)) + ################################################################################### + + + nele = ak.sum(elesel,axis=1) + names_events.append("Final ele selection") + values_events.append(ak.sum(nele == 2)) # sorting after selecting should be faster (less computations on average) # for this channel (Zll / 2L) @@ -1022,7 +1128,12 @@ def deepflavcvsbtag(jet): #jets["btagDeepFlavCvL"] = deepflavcvsltag(jets) #jets["btagDeepFlavCvB"] = deepflavcvsbtag(jets) jets = jets[ak.argsort(jets.btagDeepFlavCvL, axis=1, ascending=False)] - + ################################################################################### + nEvent_jets = 
ak.sum((jets.btagDeepFlavCvL>=0), axis = 1) + nEvent_jets = ak.sum((nEvent_jets >= 2)) + names_events.append("Number of 2+ jet events") + values_events.append(nEvent_jets) + ################################################################################### # Jets are considered only if the following identification conditions hold, as mentioned in AN # - Here is some documentation related to puId and jetId: @@ -1030,9 +1141,89 @@ def deepflavcvsbtag(jet): # https://twiki.cern.ch/twiki/bin/viewauth/CMS/JetID jet_conditions = (((abs(jets.eta) < 2.4) & (jets.pt > 20) & (jets.puId > 0)) \ | ((jets.pt>50) & (jets.jetId>5))) & ak.all(jets.metric_table(ll_cand.lep1)>0.4, axis =2) & ak.all(jets.metric_table(ll_cand.lep2)>0.4, axis = 2) + + ################################################################################### + ############### Cutflow every single cut ########################################## + ################################################################################### + n_event_jet_eta = ak.sum((abs(jets.eta) < 2.4), axis = 1) + n_event_jet_eta = ak.sum((n_event_jet_eta >= 2)) + names_events.append("Number of jet events eta cut") + values_events.append(n_event_jet_eta) + ################################################################################### + n_event_jet_pt = ak.sum((jets.pt > 20), axis = 1) + n_event_jet_pt = ak.sum((n_event_jet_pt >= 2)) + names_events.append("Number of jet events pt cut") + values_events.append(n_event_jet_pt) + ################################################################################### + n_event_jet_puId = ak.sum((jets.puId > 0), axis = 1) + n_event_jet_puId = ak.sum((n_event_jet_puId >= 2)) + names_events.append("Number of jet events puId cut") + values_events.append(n_event_jet_puId) + ################################################################################### + n_event_jet_pt_strong = ak.sum((jets.pt>50), axis = 1) + n_event_jet_pt_strong = ak.sum((n_event_jet_pt_strong >= 2)) + names_events.append("Number of jet events pt strong cut") + values_events.append(n_event_jet_pt_strong) + ################################################################################### + n_event_jet_jetId = ak.sum((jets.jetId>5), axis = 1) + n_event_jet_jetId = ak.sum((n_event_jet_jetId >= 2)) + names_events.append("Number of jet events jet_id cut") + values_events.append(n_event_jet_jetId) + ################################################################################### + n_event_jet_lepton_clean_1 = ak.sum(ak.all(jets.metric_table(ll_cand.lep1)>0.4, axis =2), axis = 1) + n_event_jet_lepton_clean_1 = ak.sum((n_event_jet_lepton_clean_1 >= 2)) + names_events.append("Number of jet events lepton clean 1 cut") + values_events.append(n_event_jet_lepton_clean_1) + ################################################################################### + n_event_jet_lepton_clean_2 = ak.sum(ak.all(jets.metric_table(ll_cand.lep2)>0.4, axis =2), axis = 1) + n_event_jet_lepton_clean_2 = ak.sum((n_event_jet_lepton_clean_2 >= 2)) + names_events.append("Number of jet events lepton clean 2 cut") + values_events.append(n_event_jet_lepton_clean_2) + ################################################################################### + + ################################################################################### + ############### Cutflow cuts applied gradually #################################### + ################################################################################### + jets_pt_eta = ((abs(jets.eta) < 2.4) & (jets.pt > 20)) + njet_pt_eta = 
ak.sum(jets_pt_eta,axis=1) + names_events.append("Number of jets selection pt eta") + values_events.append(ak.sum(njet_pt_eta >= 2)) + ################################################################################### + jsel_plus_puId = ((abs(jets.eta) < 2.4) & (jets.pt > 20) & (jets.puId > 0)) + njets_plus_puId = ak.sum(jsel_plus_puId,axis=1) + names_events.append("Number of jets selection pt eta puId") + values_events.append(ak.sum(njets_plus_puId >= 2)) + ################################################################################### + jsel_plus_jetId = ((jets.pt>50) & (jets.jetId>5)) + njets_plus_jetId = ak.sum(jsel_plus_jetId,axis=1) + names_events.append("Number of jets selection pt jetId") + values_events.append(ak.sum(njets_plus_jetId >= 2)) + ################################################################################### + jsel_no_cleaning = (((abs(jets.eta) < 2.4) & (jets.pt > 20) & (jets.puId > 0)) \ + | ((jets.pt>50) & (jets.jetId>5))) + njets_no_cleaning = ak.sum(jsel_no_cleaning,axis=1) + names_events.append("Number of jets full selection no cleaning") + values_events.append(ak.sum(njets_no_cleaning >= 2)) + ################################################################################### + jsel_one_cleaning = (((abs(jets.eta) < 2.4) & (jets.pt > 20) & (jets.puId > 0)) \ + | ((jets.pt>50) & (jets.jetId>5))) & ak.all(jets.metric_table(ll_cand.lep1)>0.4, axis =2) + njets_one_cleaning = ak.sum(jsel_one_cleaning,axis=1) + names_events.append("Number of jets full selection cleaning 1 lepton") + values_events.append(ak.sum(njets_one_cleaning >= 2)) + ################################################################################### + jsel_two_cleaning = (((abs(jets.eta) < 2.4) & (jets.pt > 20) & (jets.puId > 0)) \ + | ((jets.pt>50) & (jets.jetId>5))) & ak.all(jets.metric_table(ll_cand.lep2)>0.4, axis =2) + njets_two_cleaning = ak.sum(jsel_two_cleaning,axis=1) + names_events.append("Number of jets full selection cleaning 2 lepton") + values_events.append(ak.sum(njets_two_cleaning >= 2)) + ################################################################################### + + # Count how many jets exist that pass this selection njet = ak.sum(jet_conditions,axis=1) selection.add('jetsel',ak.to_numpy(njet>=2)) + names_events.append("Number of jet events final cut") + values_events.append(ak.sum((njet >= 2))) # ================================================================================= @@ -1263,6 +1454,9 @@ def res(mval, out): selection.add('CR_t_tbar_2LH',ak.to_numpy(req_cr_t_tbar_vpt_high)) + with open(f"./{folder_save}/event_nr.txt", "a") as myfile: + myfile.write(f"Nr of events in {filename} from {start} to {stop}: " + str(nEvents) + " " + '\n') + myfile.write(f"Nr of muon events in {filename} from {start} to {stop} with pt, eta, looseId, iso cuts : " + str(nEvent_mu) + " " + str(n_event_mu_pt) + " " + str(n_event_mu_eta) + " " + str(n_event_mu_looseId) + " " + str(n_event_mu_iso) + " " + str(n_event_mu_pt) + ' ' + str(n_event_mu_sel) + " " + '\n') @@ -1645,6 +1839,37 @@ def res(mval, out): df_wei = pd.DataFrame([], columns = ['weights']) df_wei['weights'] = list_weights weight = np.array(list_weights) + + try: + df_muons = pd.read_csv(f'{folder_save}/muons.csv') + except FileNotFoundError: + df_muons = pd.DataFrame([], columns = ['pt', 'looseid', 'looseid_cut']) + df_muons_this_file = pd.DataFrame([], columns = ['pt', 'looseid', 'looseid_cut']) + df_muons_this_file['looseid'] = pd.Series(np.array(ak.ravel(event_mu.looseId))) + df_muons_this_file['looseid_cut'] = 
pd.Series(np.array(ak.ravel((event_mu.looseId>2)))) + df_muons_this_file['pt'] = pd.Series(np.array(ak.ravel(event_mu.pt))) + df_muons = pd.concat([df_muons, df_muons_this_file], ignore_index = True) + df_muons.to_csv(f'{folder_save}/muons.csv', sep=',', encoding='utf-8', index=False) + + try: + df_cutflow = pd.read_csv(f'{folder_save}/cutflow.csv') + except FileNotFoundError: + df_cutflow = pd.DataFrame([], columns = names_events) + + + + + elements_start = df_cutflow["Start"] + df_cutflow.loc[f"{filename}_{start}_{stop}"] = values_events + if "Sum" in elements_start.values: + df_cutflow = df_cutflow[:-2] + df_cutflow.loc[f"{filename}_{start}_{stop}"] = values_events + df_cutflow.loc[f"Sum over file"] = [np.sum(df_cutflow[name]) if name not in names_events[:3] else "Sum" for name in names_events] + elif "Sum" not in elements_start.values: + df_cutflow.loc[f"Sum over file"] = [np.sum(df_cutflow[name]) if name not in names_events[:3] else "Sum" for name in names_events] + + + df_cutflow.to_csv(f'{folder_save}/cutflow.csv', sep=',', encoding='utf-8', index=False) #df_weights_full = pd.concat([df_weights, df_wei], ignore_index = True) @@ -1667,13 +1892,13 @@ def res(mval, out): for var in lists_of_vars.keys(): try: - else_var_array = np.load(f'{folder_save}/{dataset}/{filename}/test_{var}_full.npy') + else_var_array = np.load(f'{folder_save}/{dataset}/{filename}/test_{var}__{start}_{stop}_full.npy') except FileNotFoundError: else_var_array = np.array([]) finally: else_v_curr_array = np.array(lists_of_vars[var]) else_var_full_array = np.concatenate((else_var_array, else_v_curr_array), axis = None) - np.save(f'{folder_save}/{dataset}/{filename}/test_{var}_full.npy', else_var_full_array, allow_pickle = False) + np.save(f'{folder_save}/{dataset}/{filename}/test_{var}_{start}_{stop}_full.npy', else_var_full_array, allow_pickle = False) #df_else_full = pd.concat([df_else_everything, df_else], ignore_index = True) From 24e7e9585440b61bbe7b77b6b3fb000b055256f6 Mon Sep 17 00:00:00 2001 From: ValVau <109755950+ValVau@users.noreply.github.com> Date: Thu, 9 Nov 2023 14:22:22 +0100 Subject: [PATCH 3/3] Updated xgb_files --- xgb_test_data_DATA_no_coffea_chi2.py | 861 ++++++++++++ xgb_test_no_coffea.py | 231 ++- xgb_test_no_coffea_chi2.py | 778 +++++++++++ xgb_test_no_coffea_diff_bgs.py | 776 +++++++++++ xgb_test_no_coffea_diff_bgs_DATA.py | 793 +++++++++++ xgb_test_no_coffea_diff_bgs_DATA_scale.py | 808 +++++++++++ ...st_no_coffea_diff_bgs_DATA_scale_pandas.py | 1239 +++++++++++++++++ ...a_diff_bgs_DATA_scale_pandas_numpy_test.py | 813 +++++++++++ xgb_test_only_xgb_no_coffea.py | 51 +- xgb_test_only_xgb_no_coffea_diff_bgs.py | 416 ++++++ xgb_test_only_xgb_no_coffea_diff_bgs_3bgs.py | 417 ++++++ ...st_only_xgb_no_coffea_diff_bgs_all etas.py | 530 +++++++ ...only_xgb_no_coffea_diff_bgs_full_bg_set.py | 418 ++++++ xgb_test_only_xgb_reloaded_no_coffea.py | 22 +- xgb_test_only_xgb_reloaded_no_coffea_var.py | 69 +- xgb_test_only_xgb_reloaded_no_coffea_vars.py | 521 +++++++ ...est_only_xgb_reloaded_no_coffea_vars_bg.py | 524 +++++++ ..._xgb_reloaded_no_coffea_vars_bg_multibg.py | 525 +++++++ ...d_no_coffea_vars_bg_multibg_full_bg_set.py | 589 ++++++++ 19 files changed, 10277 insertions(+), 104 deletions(-) create mode 100644 xgb_test_data_DATA_no_coffea_chi2.py create mode 100644 xgb_test_no_coffea_chi2.py create mode 100644 xgb_test_no_coffea_diff_bgs.py create mode 100644 xgb_test_no_coffea_diff_bgs_DATA.py create mode 100644 xgb_test_no_coffea_diff_bgs_DATA_scale.py create mode 100644 
xgb_test_no_coffea_diff_bgs_DATA_scale_pandas.py create mode 100644 xgb_test_no_coffea_diff_bgs_DATA_scale_pandas_numpy_test.py create mode 100644 xgb_test_only_xgb_no_coffea_diff_bgs.py create mode 100644 xgb_test_only_xgb_no_coffea_diff_bgs_3bgs.py create mode 100644 xgb_test_only_xgb_no_coffea_diff_bgs_all etas.py create mode 100644 xgb_test_only_xgb_no_coffea_diff_bgs_full_bg_set.py create mode 100644 xgb_test_only_xgb_reloaded_no_coffea_vars.py create mode 100644 xgb_test_only_xgb_reloaded_no_coffea_vars_bg.py create mode 100644 xgb_test_only_xgb_reloaded_no_coffea_vars_bg_multibg.py create mode 100644 xgb_test_only_xgb_reloaded_no_coffea_vars_bg_multibg_full_bg_set.py diff --git a/xgb_test_data_DATA_no_coffea_chi2.py b/xgb_test_data_DATA_no_coffea_chi2.py new file mode 100644 index 0000000..b9dac73 --- /dev/null +++ b/xgb_test_data_DATA_no_coffea_chi2.py @@ -0,0 +1,861 @@ +from coffea.util import load +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt, mplhep as hep +import hist +import argparse, sys, os, arrow, glob, yaml +from matplotlib.offsetbox import AnchoredText +from pathlib import Path +import os +from BTVNanoCommissioning.utils.plot_utils import ( + plotratio, + +) +net_path = "/net/scratch_cms3a/vaulin/" +folder_save = 'eval_23_08_08' +if not os.path.exists(f"./plot/{folder_save}"): + os.mkdir(f"./plot/{folder_save}") +if not os.path.exists(f"./plot/{folder_save}/Small_scale"): + os.mkdir(f"./plot/{folder_save}/Small_scale") +if not os.path.exists(f"./plot/{folder_save}/Big_scale"): + os.mkdir(f"./plot/{folder_save}/Big_scale") +if not os.path.exists(f"./plot/{folder_save}/Small_but_not_that_small_scale"): + os.mkdir(f"./plot/{folder_save}/Small_but_not_that_small_scale") +if not os.path.exists(net_path + f"plot/{folder_save}"): + os.mkdir(net_path + f"plot/{folder_save}") +def autoranger(array): + val, axis = array, np.arange(0,len(array)+1) + for i in range(len(val)): + if val[i] != 0: + mins = i + break + for i in reversed(range(len(val))): + if val[i] != 0: + maxs = i + 1 + break + print(axis[mins], axis[maxs]) + return axis[mins], axis[maxs], np.max(val), np.min(val) +names_sig = ['wei', 'Higgs_mass', 'Higgs_pt', 'Z_pt', 'Z_mass', 'Z_pt_gen', 'Z_mass_gen', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +names_sig_data = ['wei', 'Higgs_mass', 'Higgs_pt', 'Z_pt', 'Z_mass', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +roiis = ['high_mumu', 'high_ee', 'low_mumu', 'low_ee'] +roi = 'low_mumu' +###################################################################################### +##### Read np arrays of signal sample ################################################ +###################################################################################### +data_path = 'condor_signal_06_mid/' +paths_np = [str(x) for x in Path(data_path + "ZHToCC_vau_sig").glob("**/*.npy") if ("_full" in str(x))] +#print(paths_np) +print(len(paths_np)) +df_sig_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_sig_full_np) + +key_np = {} +for col in names_sig: + for rois in roiis: + key_np[f'{col}_{rois}'] = [] +for col in names_sig: + for rois in roiis: + for path in 
paths_np: + if f'{col}_{rois}' in path: + key_np[f'{col}_{rois}'].append(path) + +for key in key_np.keys(): + #print(len(key_np[key]) == len(set(key_np[key]))) + key_np[key] = [np.load(element) for element in key_np[key]] + #print(key) + +print(key_np) + +key_np_full = {} +max_length = 0 +for col in names_sig: + for rois in roiis: + key_np_full[f'{col}_{rois}'] = np.array([]) +print(key_np_full) +for key in key_np_full.keys(): + key_np_full[key] = np.concatenate(tuple(key_np[key]), axis = None) + print(len(key_np_full[key])) + if max_length < len(key_np_full[key]): + max_length = len(key_np_full[key]) + +for key in key_np_full.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_sig_full_np[key] = list(np.append(key_np_full[key], np.repeat(np.nan, max_length- (len(key_np_full[key]))))) +#df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_sig_full_np) +df_s_new_np = df_sig_full_np[[f'{col}_{roi}' for col in names_sig]] + +print(len(df_s_new_np[f"wei_{roi}"])) +our_aray_results = len(df_s_new_np[f"wei_{roi}"]) + + + +df_s_new_np = df_s_new_np.dropna() +print(df_s_new_np) +len_var = [] +for col in names_sig: + len_var.append(len(df_s_new_np[f'{col}_{roi}'])) + df_s_new_np['target'] = np.ones(np.max(len_var)) +print(df_s_new_np) + + +df_s_new_np.to_csv(f'./plot/{folder_save}/numpy_data_signal.csv', sep=',', encoding='utf-8', index=False) +#df_s_new_np = pd.read_csv(f'./plot/{folder_save}/numpy_data.csv', sep=',', encoding='utf-8') +###################################################################################### + + +###################################################################################### +##### Read np arrays of background sample ############################################ +###################################################################################### +data_path = 'condor_back_07_early/' +#paths_np_back = [str(x) for x in Path(data_path + "DYJetsToLL_nlo_vau_bg").glob("**/*.npy") if ("_full" in str(x))] +paths_np_back = [str(x) for x in Path(data_path + "TTTo2L2Nu_vau_bg").glob("**/*.npy") if ("_full" in str(x))] +#paths_np_back = [str(x) for x in Path("./condor_back_04_mid/DYJetsToLL_nlo_vau_bg").glob("**/*.npy") if ("_full" in str(x))] +#print(paths_np_back)TTTo2L2Nu_vau_bg +print(len(paths_np_back)) +df_back_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_back_full_np) + +key_np_back = {} +for col in names_sig: + for rois in roiis: + key_np_back[f'{col}_{rois}'] = [] +for col in names_sig: + for rois in roiis: + for path in paths_np_back: + if f'{col}_{rois}' in path: + key_np_back[f'{col}_{rois}'].append(path) +#print(key_np_back) +for key in key_np_back.keys(): + print(len(key_np_back[key]) == len(set(key_np_back[key]))) + key_np_back[key] = [np.load(element) for element in key_np_back[key]] + print(key) + +#print(key_np_back) + +max_length_back = 0 +key_np_full_back = {} +for col in names_sig: + for rois in roiis: + key_np_full_back[f'{col}_{rois}'] = np.array([]) +for key in key_np_full_back.keys(): + key_np_full_back[key] = np.concatenate(tuple(key_np_back[key]), axis = None) + print(len(key_np_full_back[key])) + if max_length_back < len(key_np_full_back[key]): + max_length_back = len(key_np_full_back[key]) +#print(key_np_full_back) + +for key in key_np_full_back.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_back_full_np[key] = 
list(np.append(key_np_full_back[key], np.repeat(np.nan, max_length_back- (len(key_np_full_back[key]))))) +#df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_back_full_np) +df_b_full_np = df_back_full_np[[f'{col}_{roi}' for col in names_sig]] +df_b_new_np = df_b_full_np.dropna() +print(df_b_new_np) + +len_var = [] +for col in names_sig: + len_var.append(len(df_b_new_np[f'{col}_{roi}'])) + df_b_new_np['target'] = np.zeros(np.max(len_var)) +print(df_b_new_np) +df_b_new_np.to_csv(f'./plot/{folder_save}/numpy_data_bg.csv', sep=',', encoding='utf-8', index=False) +###################################################################################### + +###################################################################################### +##### Read np arrays of data sample ################################################## +###################################################################################### +data_path = 'condor_back_08_early/' +datas = ["Run2017B_DoubleMu_vau", "Run2017D_DoubleMu_vau", "Run2017E_DoubleMu_vau", "Run2017F_DoubleMu_vau"] #"Run2017C_DoubleMu_vau" +df_data = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig_data for rois in roiis]) +for data in datas: + paths_np_data = [str(x) for x in Path(data_path + data).glob("**/*.npy") if ("_full" in str(x))] + + print(len(paths_np_data)) + df_data_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig_data for rois in roiis]) + print(df_data_full_np) + + key_np_data = {} + for col in names_sig_data: + for rois in roiis: + key_np_data[f'{col}_{rois}'] = [] + for col in names_sig_data: + for rois in roiis: + for path in paths_np_data: + if f'{col}_{rois}' in path: + key_np_data[f'{col}_{rois}'].append(path) + #print(key_np_back) + for key in key_np_data.keys(): + print(len(key_np_data[key]) == len(set(key_np_data[key]))) + key_np_data[key] = [np.load(element) for element in key_np_data[key]] + print(key) + + #print(key_np_back) + + max_length_data = 0 + key_np_full_data = {} + for col in names_sig_data: + for rois in roiis: + key_np_full_data[f'{col}_{rois}'] = np.array([]) + for key in key_np_full_data.keys(): + key_np_full_data[key] = np.concatenate(tuple(key_np_data[key]), axis = None) + print(len(key_np_full_data[key])) + if max_length_data < len(key_np_full_data[key]): + max_length_data = len(key_np_full_data[key]) + #print(key_np_full_back) + + for key in key_np_full_data.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_data_full_np[key] = list(np.append(key_np_full_data[key], np.repeat(np.nan, max_length_data- (len(key_np_full_data[key]))))) + #df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) + print(df_data_full_np) + df_dat_full_np = df_data_full_np[[f'{col}_{roi}' for col in names_sig_data]] + df_dat_new_np = df_dat_full_np.dropna() + print(df_dat_new_np) + + len_var = [] + for col in names_sig_data: + len_var.append(len(df_dat_new_np[f'{col}_{roi}'])) + df_dat_new_np['target'] = np.full(np.max(len_var), 2, dtype = int) + print(df_dat_new_np) + df_data = pd.concat([df_data, df_dat_new_np], ignore_index = True) +df_data.to_csv(f'./plot/{folder_save}/numpy_data_DATA.csv', sep=',', encoding='utf-8', index=False) +###################################################################################### + +df = pd.concat([df_s_new_np, df_b_new_np], 
ignore_index = True)
+print(df)
+print(df.info())
+df.to_csv(net_path + f'/xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8', index=False)
+
+print("% of negative weights: " + str(len(df[f"wei_{roi}"][df[f"wei_{roi}"]<0])/len(df[f"wei_{roi}"])))
+
+time = arrow.now().format("YY_MM_DD")
+plt.style.use(hep.style.ROOT)
+names_sig_updated = ['m(H)', '$p_t$(H)', '$p_t$(Z)', 'm(Z)', '$p_t$($Z_{gen}$)', 'm($Z_{gen}$)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$',
+       '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet',
+       '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$',
+       '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$']
+
+names_sig_updated_data = ['m(H)', '$p_t$(H)', '$p_t$(Z)', 'm(Z)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$',
+       '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet',
+       '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$',
+       '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$']
+
+c = 0
+
+df_hists = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis])
+for col in names_sig_data[1:]:
+
+    plt.figure(figsize=(10,10))
+    len_sig = 0
+    for i in range(0,len(df['target'])):
+        if df['target'][i] == 1:
+            len_sig += 1
+    print(len_sig)
+    names_big_ax = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'pt_lead', 'pt_sublead']
+    if col in names_big_ax:
+        hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_{roi}'][:len_sig])).plot()
+        hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_{roi}'][len_sig:])).plot()
+    else:
+        hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_{roi}'][:len_sig])).plot()
+        hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_{roi}'][len_sig:])).plot()
+    if 'pt' in col:
+        if 'ratio' not in col:
+            plt.xlabel('$p_t$ in GeV')
+        else:
+            plt.xlabel('')
+    elif 'mass' in col:
+        plt.xlabel('Mass in GeV')
+    else:
+        plt.xlabel('')
+    plt.ylabel("Counts")
+    plt.title(f'{names_sig_updated_data[c]} {roi}')
+    plt.legend(['Signal', 'Background'])
+    #plt.show()
+    plt.savefig(f"./plot/{folder_save}/{col}_{roi}.jpg")
+
+    fig, ((ax), (rax)) = plt.subplots(
+        2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True
+    )
+    fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97)
+    hep.cms.label("Private work", com="13", data=True, loc=0, ax=ax)
+    counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]), bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True)
+    counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]), bins = 80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True)
+
+    counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]), bins = 80)
+    counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]), bins = 80)
+
+    data_counts, data_bins = np.histogram(np.array(df_data[f'{col}_{roi}']), bins = 50, weights = np.array(df_data[f'wei_{roi}']))
+    df_hists[f'{col}_{roi}'] = np.array(counts22)
+    ## plot reference
+    hep.histplot(
+        np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]), bins = 50, weights = np.array(df[f'wei_{roi}'][:len_sig])),
+        label= 'Higgs -> cc',
+        histtype="step",
+
color='r', + yerr=True, + ax=ax, + density = True, + ) + # + #for i in range(0, len(bins2)-1): + # x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + # y_pos_sig = counts1[i] + (counts1[i] * 0.01) + # label_p_sig = str(counts11[i]) + # x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + # y_pos = counts2[i] + (counts2[i] * 0.01) + # label_p = str(counts22[i]) + # if i%5 == 0: + # ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + # if i%6 == 0: + # ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins = 50, weights = np.array(df[f'wei_{roi}'][len_sig:])), + label= 'tt bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + density = True, + ) + ## plot compare list + ax.errorbar( + (data_bins[:-1] + data_bins[1:])/2, + np.array(data_counts), + label='Data', + marker = 'o', + color='k', + yerr=np.sqrt(np.array(data_counts)), + linestyle = "None", + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + sigratio = ratio * np.sqrt(np.where(counts1>0, (counts11/counts1**2) * 1/(np.sum(counts11))**(2) , 0) + np.where(counts2>0, (counts22/counts2**2) * 1/(np.sum(counts22))**(2), 0)) + plt.errorbar(bins1[:-1], ratio, yerr = np.abs(sigratio), color = "k", fmt = '.', marker = 'o', markeredgecolor = 'k') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated_data[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised for sig/bg)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### Smaller scale #################################################################################### + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 160, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =160, weights = 
np.array(df[f'wei_{roi}'][len_sig:]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 160) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =160) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 160, weights = np.array(df[f'wei_{roi}'][:len_sig])), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + density = True, + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%9 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%10 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =160, weights = np.array(df[f'wei_{roi}'][len_sig:])), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + density = True, + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 160, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =160, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + sigratio = ratio * np.sqrt(np.where(counts1>0, (counts11/counts1**2) * 1/(np.sum(counts11))**(2) , 0) + np.where(counts2>0, (counts22/counts2**2) * 1/(np.sum(counts22))**(2), 0)) + plt.errorbar(bins1[:-1], ratio, yerr = np.abs(sigratio), color = "k", fmt = '.', marker = 'o', markeredgecolor = 'k') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/Small_scale/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Small_scale/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### Larger scale ############################################################################# + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = 
np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 40, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =40, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 40) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =40) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 40, weights = np.array(df[f'wei_{roi}'][:len_sig])), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + density = True, + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%4 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%5 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =40, weights = np.array(df[f'wei_{roi}'][len_sig:])), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + density = True, + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 40, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =40, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + sigratio = ratio * np.sqrt(np.where(counts1>0, (counts11/counts1**2) * 1/(np.sum(counts11))**(2) , 0) + np.where(counts2>0, (counts22/counts2**2) * 1/(np.sum(counts22))**(2), 0)) + plt.errorbar(bins1[:-1], ratio, yerr = np.abs(sigratio), color = "k", fmt = '.', marker = 'o', markeredgecolor = 'k') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/Big_scale/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Big_scale/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### Smaller scale but not that small ################################################################ + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), 
gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 120, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins = 120, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 120) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins = 120) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 120, weights = np.array(df[f'wei_{roi}'][:len_sig])), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + density = True, + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%7 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%8 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins = 120, weights = np.array(df[f'wei_{roi}'][len_sig:])), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + density = True, + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 120, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins = 120, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + sigratio = ratio * np.sqrt(np.where(counts1>0, (counts11/counts1**2) * 1/(np.sum(counts11))**(2) , 0) + np.where(counts2>0, (counts22/counts2**2) * 1/(np.sum(counts22))**(2), 0)) + plt.errorbar(bins1[:-1], ratio, yerr = np.abs(sigratio), color = "k", fmt = '.', marker = 'o', markeredgecolor = 'k') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/Small_but_not_that_small_scale/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Small_but_not_that_small_scale/compare_{col}_{roi}.jpg") + + c += 1 + +df_hists.to_csv(f'./plot/{folder_save}/hists_{roi}.csv', sep=',', encoding='utf-8', index=False) + +def gaussian(x, height, 
center, width, offset): + return height*np.exp(-(x-center)**2/(2*width**2)) + offset + +def gaussiansin(x, height, center, width, offset, k, w): + return height*np.exp(-(x-center)**2/(2*width**2)) + offset + k*np.sin(x*w) + +def chiq2_gauss(x,y,sig,N,a): + chiq1 = 0 + for i in range(0,N): + chiq1 += ((y[i]-gaussian(x[i], a[0], a[1], a[2], a[3]))/sig[i])**2 + chiq1 = chiq1/(N-4) + return chiq1 + +def chiq2_gausssin(x,y,sig,N,a): + chiq1 = 0 + for i in range(0,N): + chiq1 += ((y[i]-gaussiansin(x[i], a[0], a[1], a[2], a[3], a[4], a[5]))/sig[i])**2 + chiq1 = chiq1/(N-6) + return chiq1 + +import scipy +counts2, bins2 = np.histogram(np.array(df[f'del_phi_jj_{roi}'][len_sig:]),bins = 80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + +counts22, bins22 = np.histogram(np.array(df[f'del_phi_jj_{roi}'][len_sig:]),bins = 80) + +from scipy.fft import fft, fftfreq +from scipy import stats + +yf = fft(counts22) + +sampling_rate = 40 + +xf = fftfreq(sampling_rate*2, 1/ sampling_rate) + +plt.figure(figsize = (13,8)) +plt.plot(xf, np.abs(yf)) +plt.savefig(f"./plot/{folder_save}/compare_FFT_{roi}.pdf") +plt.savefig(f"./plot/{folder_save}/compare_FFT_{roi}.jpg") + +popt_s ,pcov_s = scipy.optimize.curve_fit(gaussiansin, bins22[:-1], counts22, sigma = np.sqrt(np.array(counts22)), absolute_sigma = True, p0= [100, 1.5, 0.5, 100, 1, 12]) + +popt_g ,pcov_g = scipy.optimize.curve_fit(gaussian, bins22[:-1], counts22, sigma = np.sqrt(np.array(counts22)), absolute_sigma = True, p0= [100, 1.5, 0.5, 100]) + +print("params gauss: ", popt_g) +print("params gauss + sin : ", popt_s) + +print('\n Chi^2/dof of gauss sine is', chiq2_gausssin(bins22[:-1], counts22, np.sqrt(np.array(counts22)), len(bins22[:-1]), popt_s)) +print('\n Chi^2 of gauss sine is', 6*chiq2_gausssin(bins22[:-1], counts22, np.sqrt(np.array(counts22)), len(bins22[:-1]), popt_s)) + +print('\n Chi^2/dof of gauss peak is', chiq2_gauss(bins22[:-1], counts22, np.sqrt(np.array(counts22)), len(bins22[:-1]), popt_g)) +print('\n Chi^2 of gauss peak is', 4*chiq2_gauss(bins22[:-1], counts22, np.sqrt(np.array(counts22)), len(bins22[:-1]), popt_g)) + +p_val_sin = 1- stats.chi2.cdf(x=6*chiq2_gausssin(bins22[:-1], counts22, np.sqrt(np.array(counts22)), len(bins22[:-1]), popt_s), df=len(counts22)-6) +p_val_gauss = 1- stats.chi2.cdf(x=4*chiq2_gauss(bins22[:-1], counts22, np.sqrt(np.array(counts22)), len(bins22[:-1]), popt_g), df=len(counts22)-4) + +print("p-value of gauss is: ", p_val_gauss, 1-p_val_gauss) +print("p-value of gauss + sin is: ", p_val_sin, 1- p_val_sin) +## plot compare list +def plot_data(x, y, unc, params, residuals, residuals_errors, pulls, pulls_errors, x_label, y_label, ylims, axes): + xlin = np.linspace(0, 3.2) + # Plot measurements and fitted parabola + axes[0].errorbar(x, y, unc, linestyle='None', color='blue', fmt='.', label='DY bg') + axes[0].plot(xlin, gaussian(xlin, *params), color='red', label='Fitted gaussian') + axes[0].set_xlabel(x_label) + axes[0].set_xlim(0, 3.2) + axes[0].set_ylabel(y_label) + axes[0].set_ylim(ylims[0], ylims[1]) + axes[0].legend() + axes[0].grid(True) + # Plot residuals + axes[1].errorbar(x, residuals, yerr=residuals_errors, color='green', capsize=3, fmt='.', ls='') + axes[1].axhline(0, color='red', linestyle='--') + axes[1].set_xlabel(x_label) + axes[1].set_ylabel('Residuals') + axes[1].grid(True) + # Plot pulls + axes[2].errorbar(x, pulls, yerr=pulls_errors, color='purple', capsize=3, fmt='.', ls='') + axes[2].axhline(0, color='red', linestyle='--') + axes[2].set_xlabel(x_label) + 
axes[2].set_ylabel('Pulls') + axes[2].grid(True) + +def plot_data_sin(x, y, unc, params, residuals, residuals_errors, pulls, pulls_errors, x_label, y_label, ylims, axes): + xlin = np.linspace(0, 3.2) + # Plot measurements and fitted parabola + axes[0].errorbar(x, y, unc, linestyle='None', color='blue', fmt='.', label='DY bg') + axes[0].plot(xlin, gaussiansin(xlin, *params), color='red', label='Fitted gaussian + sin') + axes[0].set_xlabel(x_label) + axes[0].set_xlim(0, 3.2) + axes[0].set_ylabel(y_label) + axes[0].set_ylim(ylims[0], ylims[1]) + axes[0].legend() + axes[0].grid(True) + # Plot residuals + axes[1].errorbar(x, residuals, yerr=residuals_errors, color='green', capsize=3, fmt='.', ls='') + axes[1].axhline(0, color='red', linestyle='--') + axes[1].set_xlabel(x_label) + axes[1].set_ylabel('Residuals') + axes[1].grid(True) + # Plot pulls + axes[2].errorbar(x, pulls, yerr=pulls_errors, color='purple', capsize=3, fmt='.', ls='') + axes[2].axhline(0, color='red', linestyle='--') + axes[2].set_xlabel(x_label) + axes[2].set_ylabel('Pulls') + axes[2].grid(True) + +error_count = np.sqrt(np.array(counts22)) +res_gauss = np.array(counts22) - gaussian(bins22[:-1], *popt_g) +res_gauss_sin = np.array(counts22) - gaussiansin(bins22[:-1], *popt_s) + +pulls_gauss = res_gauss/error_count +pulls_gauss_sin = res_gauss_sin/error_count +pulls_err_gauss = np.sqrt(error_count**2)/error_count + +fig, axes = plt.subplots(3, 2, figsize=(10, 8), sharex=True) +yAxisRange = [0, 400] +# Plot the first column (existing data) +plot_data(bins22[:-1], counts22, error_count, popt_g, res_gauss, error_count, pulls_gauss, pulls_err_gauss, 'x', 'y', yAxisRange, axes[:, 0]) +# Plot the second column (strange data) +plot_data_sin(bins22[:-1], counts22, error_count, popt_s, res_gauss_sin, error_count, pulls_gauss_sin, pulls_err_gauss, 'x', 'y (+sin)', yAxisRange, axes[:, 1]) +# Adjust spacing between subplots +fig.subplots_adjust(hspace=0) +fig.subplots_adjust(wspace=0.3) +#plt.show() + +fig.savefig(f"./plot/{folder_save}/compare_del_phi_jj_chi_{roi}.pdf") +fig.savefig(f"./plot/{folder_save}/compare_del_phi_jj_chi_{roi}.jpg") + +X = df.drop("target", axis = 1) +print(X) +X = X.drop(f"wei_{roi}", axis = 1) +X = X.drop(f"Z_mass_{roi}", axis = 1) +X = X.drop(f"Z_pt_gen_{roi}", axis = 1) +X = X.drop(f"Z_mass_gen_{roi}", axis = 1) +print(X) +print(X.info()) + +y = df["target"] +print(y) + + + + +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder + +categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + +from sklearn.preprocessing import StandardScaler +numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + +cat_cols = X.select_dtypes(exclude = "number").columns +num_cols = X.select_dtypes(include = "number").columns + +print(cat_cols) +print(num_cols) + +from sklearn.compose import ColumnTransformer + +full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + +import xgboost as xgb + +X_processed = full_processor.fit_transform(X) +y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state 
= 1121218)
+
+from sklearn.metrics import accuracy_score
+
+### Init classifier: shallow trees (max_depth = 3) with a small learning rate and L1/L2 regularisation against overfitting
+xgb_cl = xgb.XGBClassifier(booster = 'gbtree', base_score = 0.5, learning_rate = 0.01, gamma = 1, reg_alpha = 0.2, reg_lambda = 0.2, n_estimators = 1000, max_depth = 3, subsample = 0.8)
+
+### Fit
+xgb_cl.fit(X_train, y_train)
+
+print(xgb_cl)
+### Predict
+preds = xgb_cl.predict(X_test)
+
+print(accuracy_score(y_test, preds))
+
+from xgboost import plot_importance
+from xgboost import plot_tree
+
+importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_})
+importances = importances.sort_values(by = "Importance", ascending = False)
+importances = importances.set_index('Feature')
+print(importances)
+importances.plot.bar()
+
+fig, ax = plt.subplots(figsize=(17,12))
+plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax)
+plt.xlabel('Feature scores')
+plt.ylabel("Feature names")
+plt.title('Importance plot')
+plt.legend([''])
+#plt.show()
+plt.savefig(f"./plot/{folder_save}/importance.jpg")
+
+plt.figure(figsize=(17,12))
+plot_tree(xgb_cl, fmap = 'feature_map.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(f"./plot/{folder_save}/boost_tree.jpg", dpi = 1800)
+### result = 1/(1+np.exp(-leaf_value)) is the probability of belonging to class 1
+#plt.show()
diff --git a/xgb_test_no_coffea.py b/xgb_test_no_coffea.py
index 16547aa..f16bc8d 100644
--- a/xgb_test_no_coffea.py
+++ b/xgb_test_no_coffea.py
@@ -12,9 +12,15 @@
 )
 net_path = "/net/scratch_cms3a/vaulin/"
-folder_save = 'eval_23_04_19_later'
+folder_save = 'eval_23_08_02'
 if not os.path.exists(f"./plot/{folder_save}"):
     os.mkdir(f"./plot/{folder_save}")
+if not os.path.exists(f"./plot/{folder_save}/Small_scale"):
+    os.mkdir(f"./plot/{folder_save}/Small_scale")
+if not os.path.exists(f"./plot/{folder_save}/Big_scale"):
+    os.mkdir(f"./plot/{folder_save}/Big_scale")
+if not os.path.exists(f"./plot/{folder_save}/Small_but_not_that_small_scale"):
+    os.mkdir(f"./plot/{folder_save}/Small_but_not_that_small_scale")
 if not os.path.exists(net_path + f"plot/{folder_save}"):
     os.mkdir(net_path + f"plot/{folder_save}")
 def autoranger(array):
@@ -35,11 +41,12 @@ def autoranger(array):
     'del_phi_l2_subleading', 'del_phi_l2_leading']
 
 roiis = ['high_mumu', 'high_ee', 'low_mumu', 'low_ee']
-roi = 'low_mumu'
+roi = 'high_mumu'
 ######################################################################################
 ##### Read np arrays of signal sample ################################################
 ######################################################################################
-paths_np = [str(x) for x in Path("./condor_signal_04_mid/ZHToCC_vau_sig").glob("**/*.npy") if ("_full" in str(x))]
+data_path = 'condor_signal_06_mid/'
+paths_np = [str(x) for x in Path(data_path + "ZHToCC_vau_sig").glob("**/*.npy") if ("_full" in str(x))]
 #print(paths_np)
 print(len(paths_np))
 df_sig_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis])
@@ -54,23 +61,38 @@ def autoranger(array):
         for path in paths_np:
             if f'{col}_{rois}' in path:
                 key_np[f'{col}_{rois}'].append(path)
-#print(key_np)
+
 for key in key_np.keys():
+    #print(len(key_np[key]) == len(set(key_np[key])))
     key_np[key] = [np.load(element) for element in key_np[key]]
-#print(key_np)
+    #print(key)
+
+print(key_np)
 
 key_np_full = {}
+max_length = 0
 for col in names_sig:
     for rois in roiis:
         key_np_full[f'{col}_{rois}'] = np.array([])
+print(key_np_full)
 for key in key_np_full.keys():
     key_np_full[key] = 
np.concatenate(tuple(key_np[key]), axis = None) -#print(key_np_full) + print(len(key_np_full[key])) + if max_length < len(key_np_full[key]): + max_length = len(key_np_full[key]) for key in key_np_full.keys(): - df_sig_full_np[key] = pd.Series(key_np_full[key]) + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_sig_full_np[key] = list(np.append(key_np_full[key], np.repeat(np.nan, max_length- (len(key_np_full[key]))))) +#df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) print(df_sig_full_np) df_s_new_np = df_sig_full_np[[f'{col}_{roi}' for col in names_sig]] + +print(len(df_s_new_np[f"wei_{roi}"])) +our_aray_results = len(df_s_new_np[f"wei_{roi}"]) + + + df_s_new_np = df_s_new_np.dropna() print(df_s_new_np) len_var = [] @@ -78,14 +100,21 @@ def autoranger(array): len_var.append(len(df_s_new_np[f'{col}_{roi}'])) df_s_new_np['target'] = np.ones(np.max(len_var)) print(df_s_new_np) + + +df_s_new_np.to_csv(f'./plot/{folder_save}/numpy_data_signal.csv', sep=',', encoding='utf-8', index=False) +#df_s_new_np = pd.read_csv(f'./plot/{folder_save}/numpy_data.csv', sep=',', encoding='utf-8') ###################################################################################### ###################################################################################### ##### Read np arrays of background sample ############################################ ###################################################################################### -paths_np_back = [str(x) for x in Path("./condor_back_04_mid/DYJetsToLL_nlo_vau_bg").glob("**/*.npy") if ("_full" in str(x))] -#print(paths_np_back) +data_path = 'condor_back_07_early/' +paths_np_back = [str(x) for x in Path(data_path + "DYJetsToLL_nlo_vau_bg").glob("**/*.npy") if ("_full" in str(x))] +#paths_np_back = [str(x) for x in Path(data_path + "TTTo2L2Nu_vau_bg").glob("**/*.npy") if ("_full" in str(x))] +#paths_np_back = [str(x) for x in Path("./condor_back_04_mid/DYJetsToLL_nlo_vau_bg").glob("**/*.npy") if ("_full" in str(x))] +#print(paths_np_back)TTTo2L2Nu_vau_bg print(len(paths_np_back)) df_back_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) print(df_back_full_np) @@ -101,22 +130,31 @@ def autoranger(array): key_np_back[f'{col}_{rois}'].append(path) #print(key_np_back) for key in key_np_back.keys(): + print(len(key_np_back[key]) == len(set(key_np_back[key]))) key_np_back[key] = [np.load(element) for element in key_np_back[key]] + print(key) + #print(key_np_back) +max_length_back = 0 key_np_full_back = {} for col in names_sig: for rois in roiis: key_np_full_back[f'{col}_{rois}'] = np.array([]) for key in key_np_full_back.keys(): key_np_full_back[key] = np.concatenate(tuple(key_np_back[key]), axis = None) + print(len(key_np_full_back[key])) + if max_length_back < len(key_np_full_back[key]): + max_length_back = len(key_np_full_back[key]) #print(key_np_full_back) for key in key_np_full_back.keys(): - df_back_full_np[key] = pd.Series(key_np_full_back[key]) + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_back_full_np[key] = list(np.append(key_np_full_back[key], np.repeat(np.nan, max_length_back- (len(key_np_full_back[key]))))) +#df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) print(df_back_full_np) -df_b_new_np = df_back_full_np[[f'{col}_{roi}' for col in names_sig]] -df_b_new_np = 
df_b_new_np.dropna() +df_b_full_np = df_back_full_np[[f'{col}_{roi}' for col in names_sig]] +df_b_new_np = df_b_full_np.dropna() print(df_b_new_np) len_var = [] @@ -124,9 +162,9 @@ def autoranger(array): len_var.append(len(df_b_new_np[f'{col}_{roi}'])) df_b_new_np['target'] = np.zeros(np.max(len_var)) print(df_b_new_np) - +df_b_new_np.to_csv(f'./plot/{folder_save}/numpy_data_bg.csv', sep=',', encoding='utf-8', index=False) ###################################################################################### - +folder_save = 'eval_23_08_02' df = pd.concat([df_s_new_np, df_b_new_np], ignore_index = True) print(df) print(df.info()) @@ -142,6 +180,8 @@ def autoranger(array): '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$'] c = 0 + + for col in names_sig[1:]: plt.figure(figsize=(10,10)) @@ -170,9 +210,7 @@ def autoranger(array): plt.title(f'{names_sig_updated[c]}_low_ee') plt.legend(['Signal', 'Background']) #plt.show() - plt.savefig(net_path +f"plot/{folder_save}/{col}_{roi}.jpg") - - + plt.savefig(f"./plot/{folder_save}/{col}_{roi}.jpg") fig, ((ax), (rax)) = plt.subplots( 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True @@ -220,7 +258,8 @@ def autoranger(array): counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) ratio = np.divide(counts1, counts2, where = (counts2 != 0)) - plt.plot(bins1[:-1], ratio, 'ko') + sigratio = ratio * np.sqrt(np.where(counts1>0, (counts11/counts1**2) * 1/(np.sum(counts11))**(2) , 0) + np.where(counts2>0, (counts22/counts2**2) * 1/(np.sum(counts22))**(2), 0)) + plt.errorbar(bins1[:-1], ratio, yerr = np.abs(sigratio), color = "k", fmt = '.', marker = 'o', markeredgecolor = 'k') plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') @@ -255,31 +294,31 @@ def autoranger(array): if "norm" in config.keys() and config["norm"]: logext = "_norm" + logext ''' - fig.savefig(net_path +f"plot/{folder_save}/compare_{col}_{roi}.pdf") - fig.savefig(net_path +f"plot/{folder_save}/compare_{col}_{roi}.jpg") + fig.savefig(f"./plot/{folder_save}/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/compare_{col}_{roi}.jpg") ###################################################################################################### - #### No rescaling #################################################################################### + #### Smaller scale #################################################################################### ###################################################################################################### fig, ((ax), (rax)) = plt.subplots( 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True ) fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) - counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig])) - counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:])) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 160, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =160, weights = 
np.array(df[f'wei_{roi}'][len_sig:]), density = True) - counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80) - counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80) + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 160) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =160) ## plot reference hep.histplot( - np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig])), + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 160, weights = np.array(df[f'wei_{roi}'][:len_sig])), label= 'Higgs -> cc', histtype="step", color='r', yerr=True, ax=ax, - + density = True, ) for i in range(0, len(bins2)-1): x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] @@ -288,26 +327,27 @@ def autoranger(array): x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] y_pos = counts2[i] + (counts2[i] * 0.01) label_p = str(counts22[i]) - if i%5 == 0: + if i%9 == 0: ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') - if i%6 == 0: + if i%10 == 0: ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') ## plot compare list hep.histplot( - np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:])), + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =160, weights = np.array(df[f'wei_{roi}'][len_sig:])), label='DY bg', histtype="step", color='g', yerr=True, ax=ax, - + density = True, ) # plot ratio of com/Ref - counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) - counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 160, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =160, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) ratio = np.divide(counts1, counts2, where = (counts2 != 0)) - plt.plot(bins1[:-1], ratio, 'ko') + sigratio = ratio * np.sqrt(np.where(counts1>0, (counts11/counts1**2) * 1/(np.sum(counts11))**(2) , 0) + np.where(counts2>0, (counts22/counts2**2) * 1/(np.sum(counts22))**(2), 0)) + plt.errorbar(bins1[:-1], ratio, yerr = np.abs(sigratio), color = "k", fmt = '.', marker = 'o', markeredgecolor = 'k') plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') @@ -342,31 +382,31 @@ def autoranger(array): if "norm" in config.keys() and config["norm"]: logext = "_norm" + logext ''' - fig.savefig(net_path +f"plot/{folder_save}/compare_no_dense_{col}_{roi}.pdf") - fig.savefig(net_path +f"plot/{folder_save}/compare_no_dense_{col}_{roi}.jpg") + fig.savefig(f"./plot/{folder_save}/Small_scale/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Small_scale/compare_{col}_{roi}.jpg") ###################################################################################################### - #### No rescaling hist density ###################################################################### + #### Larger scale ############################################################################# ###################################################################################################### fig, ((ax), (rax)) = plt.subplots( 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, 
sharex=True ) fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) - counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) - counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 40, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =40, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) - counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, density = True) - counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, density = True) + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 40) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =40) ## plot reference hep.histplot( - np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True), + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 40, weights = np.array(df[f'wei_{roi}'][:len_sig])), label= 'Higgs -> cc', histtype="step", color='r', yerr=True, ax=ax, - + density = True, ) for i in range(0, len(bins2)-1): x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] @@ -375,26 +415,27 @@ def autoranger(array): x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] y_pos = counts2[i] + (counts2[i] * 0.01) label_p = str(counts22[i]) - if i%5 == 0: + if i%4 == 0: ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') - if i%6 == 0: + if i%5 == 0: ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') ## plot compare list hep.histplot( - np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True), + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =40, weights = np.array(df[f'wei_{roi}'][len_sig:])), label='DY bg', histtype="step", color='g', yerr=True, ax=ax, - + density = True, ) # plot ratio of com/Ref - counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) - counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 40, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =40, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) ratio = np.divide(counts1, counts2, where = (counts2 != 0)) - plt.plot(bins1[:-1], ratio, 'ko') + sigratio = ratio * np.sqrt(np.where(counts1>0, (counts11/counts1**2) * 1/(np.sum(counts11))**(2) , 0) + np.where(counts2>0, (counts22/counts2**2) * 1/(np.sum(counts22))**(2), 0)) + plt.errorbar(bins1[:-1], ratio, yerr = np.abs(sigratio), color = "k", fmt = '.', marker = 'o', markeredgecolor = 'k') plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') @@ -429,25 +470,25 @@ def autoranger(array): if "norm" in config.keys() and config["norm"]: logext = "_norm" + logext ''' - fig.savefig(net_path +f"plot/{folder_save}/compare_np_dense_{col}_{roi}.pdf") - 
fig.savefig(net_path +f"plot/{folder_save}/compare_np_dense_{col}_{roi}.jpg") + fig.savefig(f"./plot/{folder_save}/Big_scale/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Big_scale/compare_{col}_{roi}.jpg") ###################################################################################################### - #### No rescaling hist density True ################################################################# + #### Smaller scale but not that small ################################################################ ###################################################################################################### fig, ((ax), (rax)) = plt.subplots( 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True ) fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) - counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) - counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 120, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins = 120, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) - counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, density = True) - counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, density = True) + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 120) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins = 120) ## plot reference hep.histplot( - np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True), + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 120, weights = np.array(df[f'wei_{roi}'][:len_sig])), label= 'Higgs -> cc', histtype="step", color='r', @@ -462,13 +503,13 @@ def autoranger(array): x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] y_pos = counts2[i] + (counts2[i] * 0.01) label_p = str(counts22[i]) - if i%5 == 0: + if i%7 == 0: ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') - if i%6 == 0: + if i%8 == 0: ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') ## plot compare list hep.histplot( - np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True), + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins = 120, weights = np.array(df[f'wei_{roi}'][len_sig:])), label='DY bg', histtype="step", color='g', @@ -478,10 +519,11 @@ def autoranger(array): ) # plot ratio of com/Ref - counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) - counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 120, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins = 120, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) ratio = np.divide(counts1, counts2, 
where = (counts2 != 0)) - plt.plot(bins1[:-1], ratio, 'ko') + sigratio = ratio * np.sqrt(np.where(counts1>0, (counts11/counts1**2) * 1/(np.sum(counts11))**(2) , 0) + np.where(counts2>0, (counts22/counts2**2) * 1/(np.sum(counts22))**(2), 0)) + plt.errorbar(bins1[:-1], ratio, yerr = np.abs(sigratio), color = "k", fmt = '.', marker = 'o', markeredgecolor = 'k') plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') @@ -505,6 +547,7 @@ def autoranger(array): hep.mpl_magic(ax=ax) ax.set_ylim(bottom=0) + logext = "" ''' # log y axis @@ -516,10 +559,62 @@ def autoranger(array): if "norm" in config.keys() and config["norm"]: logext = "_norm" + logext ''' - fig.savefig(net_path +f"plot/{folder_save}/compare_np_dense_true_{col}_{roi}.pdf") - fig.savefig(net_path +f"plot/{folder_save}/compare_np_dense_true_{col}_{roi}.jpg") + fig.savefig(f"./plot/{folder_save}/Small_but_not_that_small_scale/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Small_but_not_that_small_scale/compare_{col}_{roi}.jpg") c += 1 + + +fig, ((ax), (rax)) = plt.subplots( +2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True +) +fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) +hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) +counts2, bins2 = np.histogram(np.array(df[f'del_phi_jj_{roi}'][len_sig:]),bins = 80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + +counts22, bins22 = np.histogram(np.array(df[f'del_phi_jj_{roi}'][len_sig:]),bins = 80) +## plot compare list +hep.histplot( + np.histogram(np.array(df[f'del_phi_jj_{roi}'][len_sig:]),bins =80), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + density = False, ) + # plot ratio of com/Ref + +counts2, bins2 = np.histogram(np.array(df[f'del_phi_jj_{roi}'][len_sig:]),bins = 80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) +#ratio = np.divide(counts1, counts2, where = (counts2 != 0)) +#sigratio = ratio * np.sqrt(np.where(counts1>0, (counts11/counts1**2) * 1/(np.sum(counts11))**(2) , 0) + np.where(counts2>0, (counts22/counts2**2) * 1/(np.sum(counts22))**(2), 0)) +#plt.errorbar(bins1[:-1], ratio, yerr = np.abs(sigratio), color = "k", fmt = '.', marker = 'o', markeredgecolor = 'k') +#plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + +## plot settings, adjust range +rax.set_xlabel(f'$\Delta\Phi(j1, j2)$ {roi}') +ax.set_xlabel(None) +ax.set_ylabel("Events (normalised)") +rax.set_ylabel('$\\frac{Signal}{Background}$') +ax.ticklabel_format(style="sci", scilimits=(-3, 3)) +ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) +ax.legend() +rax.set_ylim(0.0, 2.0) +xmin, xmax, maxval, minval = autoranger(np.array(df[f'del_phi_jj_{roi}'][:len_sig])) +rax.set_xlim(minval, maxval) +at = AnchoredText( + "", + loc=2, + frameon=False, + ) +ax.add_artist(at) +hep.mpl_magic(ax=ax) +ax.set_ylim(bottom=0) + +logext = "" + +fig.savefig(f"./plot/{folder_save}/Small_scale/compare_{col}_{roi}.pdf") +fig.savefig(f"./plot/{folder_save}/Small_scale/compare_{col}_{roi}.jpg") X = df.drop("target", axis = 1) print(X) @@ -594,12 +689,12 @@ def autoranger(array): plt.title('Importance plot') plt.legend(['']) #plt.show() -plt.savefig(net_path + f"plot/{folder_save}/importance.jpg") +plt.savefig(f"./plot/{folder_save}/importance.jpg") plt.figure(figsize=(17,12)) plot_tree(xgb_cl, fmap = 'feature_map.txt') plt.title('Decision tree graph') #plt.show() -plt.savefig(net_path + f"plot/{folder_save}/boost_tree.jpg", dpi = 1800) 
+plt.savefig(f"./plot/{folder_save}/boost_tree.jpg", dpi = 1800) ###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 #plt.show() diff --git a/xgb_test_no_coffea_chi2.py b/xgb_test_no_coffea_chi2.py new file mode 100644 index 0000000..3a89590 --- /dev/null +++ b/xgb_test_no_coffea_chi2.py @@ -0,0 +1,778 @@ +from coffea.util import load +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt, mplhep as hep +import hist +import argparse, sys, os, arrow, glob, yaml +from matplotlib.offsetbox import AnchoredText +from pathlib import Path +import os +from BTVNanoCommissioning.utils.plot_utils import ( + plotratio, + +) +net_path = "/net/scratch_cms3a/vaulin/" +folder_save = 'eval_23_08_06_tt' +if not os.path.exists(f"./plot/{folder_save}"): + os.mkdir(f"./plot/{folder_save}") +if not os.path.exists(f"./plot/{folder_save}/Small_scale"): + os.mkdir(f"./plot/{folder_save}/Small_scale") +if not os.path.exists(f"./plot/{folder_save}/Big_scale"): + os.mkdir(f"./plot/{folder_save}/Big_scale") +if not os.path.exists(f"./plot/{folder_save}/Small_but_not_that_small_scale"): + os.mkdir(f"./plot/{folder_save}/Small_but_not_that_small_scale") +if not os.path.exists(net_path + f"plot/{folder_save}"): + os.mkdir(net_path + f"plot/{folder_save}") +def autoranger(array): + val, axis = array, np.arange(0,len(array)+1) + for i in range(len(val)): + if val[i] != 0: + mins = i + break + for i in reversed(range(len(val))): + if val[i] != 0: + maxs = i + 1 + break + print(axis[mins], axis[maxs]) + return axis[mins], axis[maxs], np.max(val), np.min(val) +names_sig = ['wei', 'Higgs_mass', 'Higgs_pt', 'Z_pt', 'Z_mass', 'Z_pt_gen', 'Z_mass_gen', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +roiis = ['high_mumu', 'high_ee', 'low_mumu', 'low_ee'] +roi = 'high_ee' +###################################################################################### +##### Read np arrays of signal sample ################################################ +###################################################################################### +data_path = 'condor_signal_06_mid/' +paths_np = [str(x) for x in Path(data_path + "ZHToCC_vau_sig").glob("**/*.npy") if ("_full" in str(x))] +#print(paths_np) +print(len(paths_np)) +df_sig_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_sig_full_np) + +key_np = {} +for col in names_sig: + for rois in roiis: + key_np[f'{col}_{rois}'] = [] +for col in names_sig: + for rois in roiis: + for path in paths_np: + if f'{col}_{rois}' in path: + key_np[f'{col}_{rois}'].append(path) + +for key in key_np.keys(): + #print(len(key_np[key]) == len(set(key_np[key]))) + key_np[key] = [np.load(element) for element in key_np[key]] + #print(key) + +print(key_np) + +key_np_full = {} +max_length = 0 +for col in names_sig: + for rois in roiis: + key_np_full[f'{col}_{rois}'] = np.array([]) +print(key_np_full) +for key in key_np_full.keys(): + key_np_full[key] = np.concatenate(tuple(key_np[key]), axis = None) + print(len(key_np_full[key])) + if max_length < len(key_np_full[key]): + max_length = len(key_np_full[key]) + +for key in key_np_full.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_sig_full_np[key] = list(np.append(key_np_full[key], np.repeat(np.nan, max_length- (len(key_np_full[key]))))) +#df_sig_full_np = 
pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis])
+print(df_sig_full_np)
+df_s_new_np = df_sig_full_np[[f'{col}_{roi}' for col in names_sig]]
+
+print(len(df_s_new_np[f"wei_{roi}"]))
+our_array_results = len(df_s_new_np[f"wei_{roi}"])
+
+
+
+df_s_new_np = df_s_new_np.dropna()
+print(df_s_new_np)
+len_var = []
+for col in names_sig:
+    len_var.append(len(df_s_new_np[f'{col}_{roi}']))
+    df_s_new_np['target'] = np.ones(np.max(len_var))
+print(df_s_new_np)
+
+
+df_s_new_np.to_csv(f'./plot/{folder_save}/numpy_data_signal.csv', sep=',', encoding='utf-8', index=False)
+#df_s_new_np = pd.read_csv(f'./plot/{folder_save}/numpy_data.csv', sep=',', encoding='utf-8')
+######################################################################################
+
+
+######################################################################################
+##### Read np arrays of background sample ############################################
+######################################################################################
+data_path = 'condor_back_07_early/'
+#paths_np_back = [str(x) for x in Path(data_path + "DYJetsToLL_nlo_vau_bg").glob("**/*.npy") if ("_full" in str(x))]
+paths_np_back = [str(x) for x in Path(data_path + "TTTo2L2Nu_vau_bg").glob("**/*.npy") if ("_full" in str(x))]
+#paths_np_back = [str(x) for x in Path("./condor_back_04_mid/DYJetsToLL_nlo_vau_bg").glob("**/*.npy") if ("_full" in str(x))]
+#print(paths_np_back)  # TTTo2L2Nu_vau_bg
+print(len(paths_np_back))
+df_back_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis])
+print(df_back_full_np)
+
+key_np_back = {}
+for col in names_sig:
+    for rois in roiis:
+        key_np_back[f'{col}_{rois}'] = []
+for col in names_sig:
+    for rois in roiis:
+        for path in paths_np_back:
+            if f'{col}_{rois}' in path:
+                key_np_back[f'{col}_{rois}'].append(path)
+#print(key_np_back)
+for key in key_np_back.keys():
+    print(len(key_np_back[key]) == len(set(key_np_back[key])))
+    key_np_back[key] = [np.load(element) for element in key_np_back[key]]
+    print(key)
+
+#print(key_np_back)
+
+max_length_back = 0
+key_np_full_back = {}
+for col in names_sig:
+    for rois in roiis:
+        key_np_full_back[f'{col}_{rois}'] = np.array([])
+for key in key_np_full_back.keys():
+    key_np_full_back[key] = np.concatenate(tuple(key_np_back[key]), axis = None)
+    print(len(key_np_full_back[key]))
+    if max_length_back < len(key_np_full_back[key]):
+        max_length_back = len(key_np_full_back[key])
+#print(key_np_full_back)
+
+for key in key_np_full_back.keys():
+    #df_sig_full_np[key] = pd.Series(key_np_full[key])
+    df_back_full_np[key] = list(np.append(key_np_full_back[key], np.repeat(np.nan, max_length_back- (len(key_np_full_back[key])))))
+#df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis])
+print(df_back_full_np)
+df_b_full_np = df_back_full_np[[f'{col}_{roi}' for col in names_sig]]
+df_b_new_np = df_b_full_np.dropna()
+print(df_b_new_np)
+
+len_var = []
+for col in names_sig:
+    len_var.append(len(df_b_new_np[f'{col}_{roi}']))
+    df_b_new_np['target'] = np.zeros(np.max(len_var))
+print(df_b_new_np)
+df_b_new_np.to_csv(f'./plot/{folder_save}/numpy_data_bg.csv', sep=',', encoding='utf-8', index=False)
+######################################################################################
+df = pd.concat([df_s_new_np, df_b_new_np], ignore_index = True)
+print(df)
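+# Negative MC weights appear in wei_{roi} (their fraction is printed a few
+# lines below). A minimal sketch of one common workaround -- clipping them at
+# zero before handing them to the BDT fit -- is kept here purely as a comment;
+# `sample_weight` and `sample_weight_train` are hypothetical names that are
+# not used anywhere in this script:
+#
+#   w = df[f"wei_{roi}"].to_numpy()
+#   sample_weight = np.clip(w, 0.0, None)   # zero out negative-weight events
+#   # later: xgb_cl.fit(X_train, y_train, sample_weight=sample_weight_train)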
+print(df.info())
+df.to_csv(net_path + f'/xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8', index=False)
+
+print("% of negative weights: " + str(len(df[f"wei_{roi}"][df[f"wei_{roi}"]<0])/len(df[f"wei_{roi}"])))
+
+time = arrow.now().format("YY_MM_DD")
+plt.style.use(hep.style.ROOT)
+names_sig_updated = ['m(H)', '$p_t$(H)', '$p_t$(Z)', 'm(Z)', '$p_t$($Z_{gen}$)', 'm($Z_{gen}$)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$',
+       '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet',
+       '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$',
+       '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$']
+
+c = 0
+
+df_hists = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis])
+for col in names_sig[1:]:
+
+    plt.figure(figsize=(10,10))
+    len_sig = 0
+    for i in range(0,len(df['target'])):
+        if df['target'][i] == 1:
+            len_sig += 1
+    print(len_sig)
+    names_big_ax = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'pt_lead', 'pt_sublead']
+    if col in names_big_ax:
+        hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_{roi}'][:len_sig])).plot()
+        hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_{roi}'][len_sig:])).plot()
+    else:
+        hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_{roi}'][:len_sig])).plot()
+        hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_{roi}'][len_sig:])).plot()
+    if 'pt' in col:
+        if 'ratio' not in col:
+            plt.xlabel('$p_t$ in GeV')
+        else:
+            plt.xlabel('')
+    elif 'mass' in col:
+        plt.xlabel('Mass in GeV')
+    else:
+        plt.xlabel('')
+    plt.ylabel("Counts")
+    plt.title(f'{names_sig_updated[c]}_{roi}')
+    plt.legend(['Signal', 'Background'])
+    #plt.show()
+    plt.savefig(f"./plot/{folder_save}/{col}_{roi}.jpg")
+
+    fig, ((ax), (rax)) = plt.subplots(
+        2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True
+    )
+    fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97)
+    hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax)
+    counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True)
+    counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True)
+
+    counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80)
+    counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80)
+    df_hists[f'{col}_{roi}'] = np.array(counts22)
+    ## plot reference
+    hep.histplot(
+        np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig])),
+        label= 'Higgs -> cc',
+        histtype="step",
+        color='r',
+        yerr=True,
+        ax=ax,
+        density = True,
+    )
+    for i in range(0, len(bins2)-1):
+        x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i]
+        y_pos_sig = counts1[i] + (counts1[i] * 0.01)
+        label_p_sig = str(counts11[i])
+        x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i]
+        y_pos = counts2[i] + (counts2[i] * 0.01)
+        label_p = str(counts22[i])
+        if i%5 == 0:
+            ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green')
+        if i%6 == 0:
+            ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red')
+    ## plot compare list
+    hep.histplot(
+        np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = 
np.array(df[f'wei_{roi}'][len_sig:])), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + density = True, + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + sigratio = ratio * np.sqrt(np.where(counts1>0, (counts11/counts1**2) * 1/(np.sum(counts11))**(2) , 0) + np.where(counts2>0, (counts22/counts2**2) * 1/(np.sum(counts22))**(2), 0)) + plt.errorbar(bins1[:-1], ratio, yerr = np.abs(sigratio), color = "k", fmt = '.', marker = 'o', markeredgecolor = 'k') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### Smaller scale #################################################################################### + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 160, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =160, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 160) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =160) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 160, weights = np.array(df[f'wei_{roi}'][:len_sig])), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + density = True, + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%9 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%10 == 0: + ax.text(x_pos_sig, 
y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =160, weights = np.array(df[f'wei_{roi}'][len_sig:])), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + density = True, + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 160, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =160, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + sigratio = ratio * np.sqrt(np.where(counts1>0, (counts11/counts1**2) * 1/(np.sum(counts11))**(2) , 0) + np.where(counts2>0, (counts22/counts2**2) * 1/(np.sum(counts22))**(2), 0)) + plt.errorbar(bins1[:-1], ratio, yerr = np.abs(sigratio), color = "k", fmt = '.', marker = 'o', markeredgecolor = 'k') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/Small_scale/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Small_scale/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### Larger scale ############################################################################# + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 40, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =40, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 40) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =40) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 40, weights = np.array(df[f'wei_{roi}'][:len_sig])), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + density = True, + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = 
counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%4 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%5 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =40, weights = np.array(df[f'wei_{roi}'][len_sig:])), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + density = True, + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 40, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =40, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + sigratio = ratio * np.sqrt(np.where(counts1>0, (counts11/counts1**2) * 1/(np.sum(counts11))**(2) , 0) + np.where(counts2>0, (counts22/counts2**2) * 1/(np.sum(counts22))**(2), 0)) + plt.errorbar(bins1[:-1], ratio, yerr = np.abs(sigratio), color = "k", fmt = '.', marker = 'o', markeredgecolor = 'k') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/Big_scale/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Big_scale/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### Smaller scale but not that small ################################################################ + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 120, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins = 120, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 120) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins = 120) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 120, weights = np.array(df[f'wei_{roi}'][:len_sig])), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + density = True, + ) + for i in range(0, 
len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%7 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%8 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins = 120, weights = np.array(df[f'wei_{roi}'][len_sig:])), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + density = True, + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 120, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins = 120, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + sigratio = ratio * np.sqrt(np.where(counts1>0, (counts11/counts1**2) * 1/(np.sum(counts11))**(2) , 0) + np.where(counts2>0, (counts22/counts2**2) * 1/(np.sum(counts22))**(2), 0)) + plt.errorbar(bins1[:-1], ratio, yerr = np.abs(sigratio), color = "k", fmt = '.', marker = 'o', markeredgecolor = 'k') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/Small_but_not_that_small_scale/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Small_but_not_that_small_scale/compare_{col}_{roi}.jpg") + + c += 1 + +df_hists.to_csv(f'./plot/{folder_save}/hists_{roi}.csv', sep=',', encoding='utf-8', index=False) + +def gaussian(x, height, center, width, offset): + return height*np.exp(-(x-center)**2/(2*width**2)) + offset + +def gaussiansin(x, height, center, width, offset, k, w): + return height*np.exp(-(x-center)**2/(2*width**2)) + offset + k*np.sin(x*w) + +def chiq2_gauss(x,y,sig,N,a): + chiq1 = 0 + for i in range(0,N): + chiq1 += ((y[i]-gaussian(x[i], a[0], a[1], a[2], a[3]))/sig[i])**2 + chiq1 = chiq1/(N-4) + return chiq1 + +def chiq2_gausssin(x,y,sig,N,a): + chiq1 = 0 + for i in range(0,N): + chiq1 += ((y[i]-gaussiansin(x[i], a[0], a[1], a[2], a[3], a[4], a[5]))/sig[i])**2 + chiq1 = chiq1/(N-6) + return chiq1 + +import scipy +counts2, bins2 = np.histogram(np.array(df[f'del_phi_jj_{roi}'][len_sig:]),bins = 80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + +counts22, bins22 = np.histogram(np.array(df[f'del_phi_jj_{roi}'][len_sig:]),bins = 80) + +from scipy.fft import fft, fftfreq +from scipy import stats + +yf = fft(counts22) + 
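+# The frequency axis below is built from a hard-coded sampling rate; since the
+# FFT input counts22 has len(counts22) == 80 samples spaced by the histogram
+# bin width, an exact axis could be derived from the bin edges instead. A
+# sketch (bin_width and xf_exact are names introduced only for illustration):
+#
+#   bin_width = bins22[1] - bins22[0]                 # width of one delta-phi bin
+#   xf_exact = fftfreq(len(counts22), d=bin_width)    # cycles per unit of delta-phi
+#
+# The fixed sampling_rate = 40 (n = 80, d = 1/40) reproduces this only if the
+# histogram spans a range of length 2, so it is an approximation for [0, pi].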
+
+# frequency axis: yf has len(counts22) samples; using the actual bin spacing
+# of the del_phi_jj histogram gives frequencies in cycles per unit del_phi
+xf = fftfreq(len(counts22), d = bins22[1] - bins22[0])
+
+plt.figure(figsize = (13,8))
+plt.plot(np.fft.fftshift(xf), np.abs(np.fft.fftshift(yf)))
+plt.savefig(f"./plot/{folder_save}/compare_FFT_{roi}.pdf")
+plt.savefig(f"./plot/{folder_save}/compare_FFT_{roi}.jpg")
+
+popt_s ,pcov_s = scipy.optimize.curve_fit(gaussiansin, bins22[:-1], counts22, sigma = np.sqrt(np.array(counts22)), absolute_sigma = True, p0= [100, 1.5, 0.5, 100, 1, 12])
+
+popt_g ,pcov_g = scipy.optimize.curve_fit(gaussian, bins22[:-1], counts22, sigma = np.sqrt(np.array(counts22)), absolute_sigma = True, p0= [100, 1.5, 0.5, 100])
+
+print("params gauss: ", popt_g)
+print("params gauss + sin : ", popt_s)
+
+# total chi^2 = reduced chi^2 * ndof, with ndof = n_points - n_parameters
+n_points = len(bins22[:-1])
+ndof_s = n_points - 6
+ndof_g = n_points - 4
+chi2red_s = chiq2_gausssin(bins22[:-1], counts22, np.sqrt(np.array(counts22)), n_points, popt_s)
+chi2red_g = chiq2_gauss(bins22[:-1], counts22, np.sqrt(np.array(counts22)), n_points, popt_g)
+
+print('\n Chi^2/dof of gauss sine is', chi2red_s)
+print('\n Chi^2 of gauss sine is', ndof_s*chi2red_s)
+
+print('\n Chi^2/dof of gauss peak is', chi2red_g)
+print('\n Chi^2 of gauss peak is', ndof_g*chi2red_g)
+
+p_val_sin = 1 - stats.chi2.cdf(x=ndof_s*chi2red_s, df=ndof_s)
+p_val_gauss = 1 - stats.chi2.cdf(x=ndof_g*chi2red_g, df=ndof_g)
+
+print("p-value of gauss is: ", p_val_gauss, 1-p_val_gauss)
+print("p-value of gauss + sin is: ", p_val_sin, 1- p_val_sin)
+## fit overview, residuals and pulls; one function serves both models
+def plot_fit(x, y, unc, params, model, model_label, residuals, residuals_errors, pulls, pulls_errors, x_label, y_label, ylims, axes):
+    xlin = np.linspace(0, 3.2)
+    # Plot measurements and fitted model
+    axes[0].errorbar(x, y, unc, linestyle='None', color='blue', fmt='.', label='DY bg')
+    axes[0].plot(xlin, model(xlin, *params), color='red', label=model_label)
+    axes[0].set_xlabel(x_label)
+    axes[0].set_xlim(0, 3.2)
+    axes[0].set_ylabel(y_label)
+    axes[0].set_ylim(ylims[0], ylims[1])
+    axes[0].legend()
+    axes[0].grid(True)
+    # Plot residuals
+    axes[1].errorbar(x, residuals, yerr=residuals_errors, color='green', capsize=3, fmt='.', ls='')
+    axes[1].axhline(0, color='red', linestyle='--')
+    axes[1].set_xlabel(x_label)
+    axes[1].set_ylabel('Residuals')
+    axes[1].grid(True)
+    # Plot pulls
+    axes[2].errorbar(x, pulls, yerr=pulls_errors, color='purple', capsize=3, fmt='.', ls='')
+    axes[2].axhline(0, color='red', linestyle='--')
+    axes[2].set_xlabel(x_label)
+    axes[2].set_ylabel('Pulls')
+    axes[2].grid(True)
+
+error_count = np.sqrt(np.array(counts22))
+res_gauss = np.array(counts22) - gaussian(bins22[:-1], *popt_g)
+res_gauss_sin = np.array(counts22) - gaussiansin(bins22[:-1], *popt_s)
+
+pulls_gauss = res_gauss/error_count
+pulls_gauss_sin = res_gauss_sin/error_count
+pulls_err_gauss = np.ones_like(error_count)  # pull uncertainties are unity by construction
+
+fig, axes = plt.subplots(3, 2, figsize=(10, 8), sharex=True)
+yAxisRange = [0, 400]
+# Left column: Gaussian-only fit
+plot_fit(bins22[:-1], counts22, error_count, popt_g, gaussian, 'Fitted gaussian', res_gauss, error_count, pulls_gauss, pulls_err_gauss, 'x', 'y', yAxisRange, axes[:, 0])
+# Right column: Gaussian + sine fit
+plot_fit(bins22[:-1], counts22, error_count, popt_s, gaussiansin, 'Fitted gaussian + sin', res_gauss_sin, error_count, pulls_gauss_sin, pulls_err_gauss, 'x', 'y (+sin)', yAxisRange, axes[:, 1])
+# Adjust spacing between subplots
+fig.subplots_adjust(hspace=0)
+fig.subplots_adjust(wspace=0.3)
+#plt.show()
+
+fig.savefig(f"./plot/{folder_save}/compare_del_phi_jj_chi_{roi}.pdf")
+fig.savefig(f"./plot/{folder_save}/compare_del_phi_jj_chi_{roi}.jpg")
+
+X = df.drop("target", axis = 1)
+print(X)
+X = X.drop(f"wei_{roi}", axis = 1)
+X = X.drop(f"Z_mass_{roi}", axis = 1)
+X = X.drop(f"Z_pt_gen_{roi}", axis = 1)
+X = X.drop(f"Z_mass_gen_{roi}", axis = 1)
+print(X)
+print(X.info())
+
+y = df["target"]
+print(y)
+
+from sklearn.impute import SimpleImputer
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import OneHotEncoder
+
+# note: sparse=False was renamed to sparse_output=False in scikit-learn >= 1.2
+categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),])
+
+from sklearn.preprocessing import StandardScaler
+numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())])
+
+cat_cols = X.select_dtypes(exclude = "number").columns
+num_cols = X.select_dtypes(include = "number").columns
+
+print(cat_cols)
+print(num_cols)
+
+from sklearn.compose import ColumnTransformer
+
+full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),])
+
+import xgboost as xgb
+
+X_processed = full_processor.fit_transform(X)
+y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1))
+
+from sklearn.model_selection import train_test_split
+
+X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state = 1121218)
+
+from sklearn.metrics import accuracy_score
+
+### Init classifier
+xgb_cl = xgb.XGBClassifier(booster = 'gbtree', base_score = 0.5, learning_rate = 0.01, gamma = 1, reg_alpha = 0.2, reg_lambda = 0.2, n_estimators = 1000, max_depth = 3, subsample = 0.8)
+
+### Fit
+xgb_cl.fit(X_train, y_train)
+
+print(xgb_cl)
+### Predict
+preds = xgb_cl.predict(X_test)
+
+print(accuracy_score(y_test, preds))
+
+from xgboost import plot_importance
+from xgboost import plot_tree
+
+importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_})
+importances = importances.sort_values(by = "Importance", ascending = False)
+importances = importances.set_index('Feature')
+print(importances)
+importances.plot.bar()
+
+fig, ax = plt.subplots(figsize=(17,12))
+plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax)
+plt.xlabel('Feature scores')
+plt.ylabel("Feature names")
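+# Accuracy alone says little for an imbalanced signal/background mix; a
+# threshold-independent cross-check (a sketch -- roc_auc_score is standard
+# scikit-learn, but this block is not part of the original training flow):
+from sklearn.metrics import roc_auc_score
+print("ROC AUC:", roc_auc_score(y_test.ravel(), xgb_cl.predict_proba(X_test)[:, 1]))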
+plt.title('Importance plot') +plt.legend(['']) +#plt.show() +plt.savefig(f"./plot/{folder_save}/importance.jpg") + +plt.figure(figsize=(17,12)) +plot_tree(xgb_cl, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"./plot/{folder_save}/boost_tree.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show() diff --git a/xgb_test_no_coffea_diff_bgs.py b/xgb_test_no_coffea_diff_bgs.py new file mode 100644 index 0000000..04b661a --- /dev/null +++ b/xgb_test_no_coffea_diff_bgs.py @@ -0,0 +1,776 @@ +from coffea.util import load +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt, mplhep as hep +import hist +import argparse, sys, os, arrow, glob, yaml +from matplotlib.offsetbox import AnchoredText +from pathlib import Path +import os +from BTVNanoCommissioning.utils.plot_utils import ( + plotratio, + +) +net_path = "/net/scratch_cms3a/vaulin/" +folder_save = 'eval_23_07_25_2' +if not os.path.exists(f"./plot/{folder_save}"): + os.mkdir(f"./plot/{folder_save}") +if not os.path.exists(f"./plot/{folder_save}/No_dense"): + os.mkdir(f"./plot/{folder_save}/No_dense") +if not os.path.exists(f"./plot/{folder_save}/Np_dense"): + os.mkdir(f"./plot/{folder_save}/Np_dense") +if not os.path.exists(f"./plot/{folder_save}/Np_dense_True"): + os.mkdir(f"./plot/{folder_save}/Np_dense_True") +if not os.path.exists(net_path + f"plot/{folder_save}"): + os.mkdir(net_path + f"plot/{folder_save}") +def autoranger(array): + val, axis = array, np.arange(0,len(array)+1) + for i in range(len(val)): + if val[i] != 0: + mins = i + break + for i in reversed(range(len(val))): + if val[i] != 0: + maxs = i + 1 + break + print(axis[mins], axis[maxs]) + return axis[mins], axis[maxs], np.max(val), np.min(val) +names_sig = ['wei', 'Higgs_mass', 'Higgs_pt', 'Z_pt', 'Z_mass', 'Z_pt_gen', 'Z_mass_gen', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +names_sig_data = ['wei', 'Higgs_mass', 'Higgs_pt', 'Z_pt', 'Z_mass', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +roiis = ['high_mumu', 'high_ee', 'low_mumu', 'low_ee'] +roi = 'low_mumu' +#roi = 'low_ee' + +###################################################################################### +##### Read np arrays of signal sample ################################################ +###################################################################################### +data_path = 'condor_signal_06_mid/' +paths_np = [str(x) for x in Path(data_path + "ZHToCC_vau_sig").glob("**/*.npy") if ("_full" in str(x))] +#print(paths_np) +print(len(paths_np)) +df_sig_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_sig_full_np) + +key_np = {} +for col in names_sig: + for rois in roiis: + key_np[f'{col}_{rois}'] = [] +for col in names_sig: + for rois in roiis: + for path in paths_np: + if f'{col}_{rois}' in path: + key_np[f'{col}_{rois}'].append(path) + +for key in key_np.keys(): + #print(len(key_np[key]) == len(set(key_np[key]))) + key_np[key] = [np.load(element) for element in key_np[key]] + #print(key) + +print(key_np) + +key_np_full = {} +max_length = 0 +for col in names_sig: + for rois in roiis: + 
key_np_full[f'{col}_{rois}'] = np.array([]) +print(key_np_full) +for key in key_np_full.keys(): + key_np_full[key] = np.concatenate(tuple(key_np[key]), axis = None) + print(len(key_np_full[key])) + if max_length < len(key_np_full[key]): + max_length = len(key_np_full[key]) + +for key in key_np_full.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_sig_full_np[key] = list(np.append(key_np_full[key], np.repeat(np.nan, max_length- (len(key_np_full[key]))))) +#df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_sig_full_np) +df_s_new_np = df_sig_full_np[[f'{col}_{roi}' for col in names_sig]] + +print(len(df_s_new_np[f"wei_{roi}"])) +our_aray_results = len(df_s_new_np[f"wei_{roi}"]) + + + +df_s_new_np = df_s_new_np.dropna() +print(df_s_new_np) +len_var = [] +for col in names_sig: + len_var.append(len(df_s_new_np[f'{col}_{roi}'])) + df_s_new_np['target'] = np.ones(np.max(len_var)) + df_s_new_np['target_bg'] = np.zeros(np.max(len_var)) +print(df_s_new_np) + + +df_s_new_np.to_csv(f'./plot/{folder_save}/numpy_data_signal_{roi}.csv', sep=',', encoding='utf-8', index=False) +#df_s_new_np = pd.read_csv(f'./plot/{folder_save}/numpy_data.csv', sep=',', encoding='utf-8') +###################################################################################### + + +###################################################################################### +##### Read np arrays of background sample ############################################ +###################################################################################### +data_path = 'condor_back_07_early/' +def bg_processor(bg, nr): + paths_np_back = [str(x) for x in Path(data_path + f"{bg}").glob("**/*.npy") if ("_full" in str(x))] + #paths_np_back = [str(x) for x in Path("./condor_back_04_mid/DYJetsToLL_nlo_vau_bg").glob("**/*.npy") if ("_full" in str(x))] + #print(paths_np_back) + print(len(paths_np_back)) + df_back_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) + print(df_back_full_np) + + key_np_back = {} + for col in names_sig: + for rois in roiis: + key_np_back[f'{col}_{rois}'] = [] + for col in names_sig: + for rois in roiis: + for path in paths_np_back: + if f'{col}_{rois}' in path: + key_np_back[f'{col}_{rois}'].append(path) + #print(key_np_back) + for key in key_np_back.keys(): + print(len(key_np_back[key]) == len(set(key_np_back[key]))) + key_np_back[key] = [np.load(element) for element in key_np_back[key]] + print(key) + + #print(key_np_back) + + max_length_back = 0 + key_np_full_back = {} + for col in names_sig: + for rois in roiis: + key_np_full_back[f'{col}_{rois}'] = np.array([]) + for key in key_np_full_back.keys(): + key_np_full_back[key] = np.concatenate(tuple(key_np_back[key]), axis = None) + print(len(key_np_full_back[key])) + if max_length_back < len(key_np_full_back[key]): + max_length_back = len(key_np_full_back[key]) + #print(key_np_full_back) + + for key in key_np_full_back.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_back_full_np[key] = list(np.append(key_np_full_back[key], np.repeat(np.nan, max_length_back- (len(key_np_full_back[key]))))) + #df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) + print(df_back_full_np) + df_b_full_np = df_back_full_np[[f'{col}_{roi}' for col in names_sig]] + df_b_new_np = 
df_b_full_np.dropna() + print(df_b_new_np) + + len_var = [] + for col in names_sig: + len_var.append(len(df_b_new_np[f'{col}_{roi}'])) + df_b_new_np['target'] = np.zeros(np.max(len_var)) + df_b_new_np['target_bg'] = np.array([nr]*np.max(len_var)) + print(df_b_new_np) + df_b_new_np.to_csv(f'./plot/{folder_save}/numpy_data_bg_{bg}_{roi}.csv', sep=',', encoding='utf-8', index=False) + return df_b_new_np, len(df_b_new_np['target']) + +df_b_new_np_dy, len_dy = bg_processor("DYJetsToLL_nlo_vau_bg", 1) +df_b_new_np_zz, len_zz = bg_processor("ZZTo2L2Q_vau_bg", 2) +df_b_new_np_wz, len_wz = bg_processor("WZTo2Q2L_vau_bg", 3) +df_b_new_np_tt, len_tt = bg_processor("TTTo2L2Nu_vau_bg", 4) +df_b_new_np_zhtobb, len_zhtobb = bg_processor("ZH_HToBB_ZLL_vau_bg_old", 5) +max_len_bg = 0 +for l in [len_dy, len_zz, len_wz, len_tt, len_zhtobb]: + if max_len_bg < l: + max_len_bg = l + +###################################################################################### +##### Read np arrays of data sample ################################################## +###################################################################################### +data_path = 'condor_back_08_early/' +datas = ["Run2017B_DoubleMu_vau", "Run2017D_DoubleMu_vau", "Run2017E_DoubleMu_vau", "Run2017F_DoubleMu_vau"] #"Run2017C_DoubleMu_vau" +df_data = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig_data for rois in roiis]) +for data in datas: + paths_np_data = [str(x) for x in Path(data_path + data).glob("**/*.npy") if ("_full" in str(x))] + + print(len(paths_np_data)) + df_data_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig_data for rois in roiis]) + print(df_data_full_np) + + key_np_data = {} + for col in names_sig_data: + for rois in roiis: + key_np_data[f'{col}_{rois}'] = [] + for col in names_sig_data: + for rois in roiis: + for path in paths_np_data: + if f'{col}_{rois}' in path: + key_np_data[f'{col}_{rois}'].append(path) + #print(key_np_back) + for key in key_np_data.keys(): + print(len(key_np_data[key]) == len(set(key_np_data[key]))) + key_np_data[key] = [np.load(element) for element in key_np_data[key]] + print(key) + + #print(key_np_back) + + max_length_data = 0 + key_np_full_data = {} + for col in names_sig_data: + for rois in roiis: + key_np_full_data[f'{col}_{rois}'] = np.array([]) + for key in key_np_full_data.keys(): + key_np_full_data[key] = np.concatenate(tuple(key_np_data[key]), axis = None) + print(len(key_np_full_data[key])) + if max_length_data < len(key_np_full_data[key]): + max_length_data = len(key_np_full_data[key]) + #print(key_np_full_back) + + for key in key_np_full_data.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_data_full_np[key] = list(np.append(key_np_full_data[key], np.repeat(np.nan, max_length_data- (len(key_np_full_data[key]))))) + #df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) + print(df_data_full_np) + df_dat_full_np = df_data_full_np[[f'{col}_{roi}' for col in names_sig_data]] + df_dat_new_np = df_dat_full_np.dropna() + print(df_dat_new_np) + + len_var = [] + for col in names_sig_data: + len_var.append(len(df_dat_new_np[f'{col}_{roi}'])) + df_dat_new_np['target'] = np.full(np.max(len_var), 2, dtype = int) + print(df_dat_new_np) + df_data = pd.concat([df_data, df_dat_new_np], ignore_index = True) +df_data.to_csv(f'./plot/{folder_save}/numpy_data_DATA.csv', sep=',', encoding='utf-8', index=False) 
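+
+# The signal, background and data loaders above all repeat the same pattern:
+# glob the per-variable .npy chunks, concatenate them, and pad every column
+# with NaN up to the longest one so they fit a rectangular DataFrame. A
+# generic helper could replace all three copies (a sketch -- load_sample is
+# illustrative only and is not called by the code below):
+def load_sample(base_dir, sample, columns, regions):
+    paths = [str(x) for x in Path(base_dir + sample).glob("**/*.npy") if "_full" in str(x)]
+    arrays = {
+        f"{c}_{r}": np.concatenate(
+            [np.load(p) for p in paths if f"{c}_{r}" in p] or [np.array([])],
+            axis = None,
+        )
+        for c in columns
+        for r in regions
+    }
+    max_len = max((len(a) for a in arrays.values()), default = 0)
+    return pd.DataFrame(
+        {k: np.append(a, np.repeat(np.nan, max_len - len(a))) for k, a in arrays.items()}
+    )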
+
+######################################################################################
+######################################################################################
+#folder_save = 'eval_23_07_25_2'
+df = pd.concat([df_s_new_np, df_b_new_np_dy], ignore_index = True)
+df = pd.concat([df, df_b_new_np_zz], ignore_index = True)
+df = pd.concat([df, df_b_new_np_wz], ignore_index = True)
+df = pd.concat([df, df_b_new_np_tt], ignore_index = True)
+df = pd.concat([df, df_b_new_np_zhtobb], ignore_index = True)
+print(df)
+print(df.info())
+df.to_csv(net_path + f'/xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8', index=False)
+df.to_csv(f'./plot/{folder_save}/xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8', index=False)
+
+#xsec_weights = [0.002342, 6077., 3.74, 6.419, 88.51, 0.00720]
+
+xsec_weights = [1 , 1, 1, 1, 1, 1]
+
+#df = pd.read_csv(f'xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8')
+
+print("Fraction of negative weights: " + str(len(df[f"wei_{roi}"][df[f"wei_{roi}"]<0])/len(df[f"wei_{roi}"])))
+
+time = arrow.now().format("YY_MM_DD")
+plt.style.use(hep.style.ROOT)
+names_sig_updated = ['m(H)', '$p_t$(H)', '$p_t$(Z)', 'm(Z)', '$p_t$($Z_{gen}$)', 'm($Z_{gen}$)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$',
+          '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet',
+          '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$',
+          '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$']
+
+names_sig_updated_data = ['m(H)', '$p_t$(H)', '$p_t$(Z)', 'm(Z)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$',
+          '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet',
+          '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$',
+          '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$']
+
+
+c = 0
+for col in names_sig[1:]:
+
+    plt.figure(figsize=(10,10))
+    len_sig = int((df['target'] == 1).sum())  # number of signal rows, counted vectorised
+    print(len_sig)
+    names_big_ax = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'pt_lead', 'pt_sublead']
+    if col in names_big_ax:
+        hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_{roi}'][:len_sig])).plot()
+        hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_{roi}'][len_sig:])).plot()
+    else:
+        hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_{roi}'][:len_sig])).plot()
+        hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_{roi}'][len_sig:])).plot()
+    if 'pt' in col:
+        if 'ratio' not in col:
+            plt.xlabel('$p_t$ in GeV')
+        else:
+            plt.xlabel('')
+    elif 'mass' in col:
+        plt.xlabel('Mass in GeV')
+    else:
+        plt.xlabel('')
+    plt.ylabel("Counts")
+    plt.title(f'{names_sig_updated[c]}_{roi}')
+    plt.legend(['Signal', 'Background'])
+    #plt.show()
+    plt.savefig(f"./plot/{folder_save}/{col}_{roi}.jpg")
+
+
+
+    fig, ((ax), (rax)) = plt.subplots(
+        2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True
+    )
+    fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97)
+    hep.cms.label("Private work", com="13", data=True, loc=0, ax=ax)
+    counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = 
True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins =80) + ## plot reference + hep.histplot( + #np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 25, weights = 100000*xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])), + #label= 'ZH -> cc signal $\cdot 10^5$', + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 50, weights = xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])), + label= 'ZH -> cc signal', + histtype="step", + color='r', + #yerr= np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 25, weights = 100000*xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig]))[0]), + yerr = True, + ax=ax, + density = True, + ) + #for i in range(0, len(bins2)-1): + # x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + # y_pos_sig = counts1[i] + (counts1[i] * 0.01) + # label_p_sig = str(counts11[i]) + # x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + # y_pos = counts2[i] + (counts2[i] * 0.01) + # label_p = str(counts22[i]) + # if i%5 == 0: + # ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + # if i%6 == 0: + # ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + n_bins = 80 + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + bins = np.array([minval + i*((maxval-minval)/50) for i in range(0, 51)]) + print(bins) + hep.histplot( + [np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]))], + stack = True, + label=['DY bg', 'ZZ bg', 'WZ bg', 'tt bg', 'ZH -> bb bg'], + histtype="fill", + color=['g', 'y', 'b', 'm', 'c'], + #bins = np.arange(80), + yerr = True, + #yerr= [np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]))[0]), + # np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]))[0]), + 
#np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]))[0]),
+               #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]))[0])],
+        ax=ax,
+        density = True,
+        alpha = [0.3, 0.3, 0.3, 0.3, 0.3],
+        edgecolor = ["k", "k", "k", "k", "k"],
+
+    )
+
+    # plot ratio of the signal density to each background density
+    nbinning = 50
+    counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True)
+    counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]), density = True)
+    counts3, bins3 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), density = True)
+    counts4, bins4 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), density = True)
+    counts5, bins5 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), density = True)
+    counts6, bins6 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), density = True)
+
+    # np.divide with `where` needs an explicit `out` array; without it the
+    # entries of empty background bins are left uninitialised
+    ratio_dy = np.divide(counts1, counts2, out = np.zeros_like(counts1), where = (counts2 != 0))
+    ratio_zz = np.divide(counts1, counts3, out = np.zeros_like(counts1), where = (counts3 != 0))
+    ratio_wz = np.divide(counts1, counts4, out = np.zeros_like(counts1), where = (counts4 != 0))
+    ratio_tt = np.divide(counts1, counts5, out = np.zeros_like(counts1), where = (counts5 != 0))
+    ratio_zhtobb = np.divide(counts1, counts6, out = np.zeros_like(counts1), where = (counts6 != 0))
+    rax.plot(bins1[:-1], ratio_dy, 'go')
+    rax.plot(bins1[:-1], ratio_zz, 'yo')
+    rax.plot(bins1[:-1], ratio_wz, 'bo')
+    rax.plot(bins1[:-1], ratio_tt, 'mo')
+    rax.plot(bins1[:-1], ratio_zhtobb, 'co')
+    rax.plot(bins1[:-1], [1]*len(ratio_dy), '--', color = 'black')
+
+
+    ## plot settings, adjust range
+    rax.set_xlabel(f'{names_sig_updated[c]} {roi}')
+    ax.set_xlabel(None)
+    ax.set_ylabel("Events (normalised)")
+    rax.set_ylabel('$\\frac{Signal}{Background}$')
+    ax.ticklabel_format(style="sci", scilimits=(-3, 3))
+    ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05))
+    ax.legend()
+    rax.set_ylim(0.0, 4.0)
+    xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig]))
+    rax.set_xlim(minval, maxval)
+    at = AnchoredText(
+        "",
+        loc=2,
+        frameon=False,
+    )
+    ax.add_artist(at)
+    #hep.mpl_magic(ax=ax)
+    ax.set_ylim(bottom=0)
+
+    logext = ""
+    '''
+    # log y axis
+    if "log" in config.keys() and config["log"]:
+        ax.set_yscale("log")
+        logext = "_log"
+        ax.set_ylim(bottom=0.1)
+        hep.mpl_magic(ax=ax)
+    if "norm" in config.keys() and config["norm"]:
+        logext = "_norm" + logext
+    '''
+    fig.savefig(f"./plot/{folder_save}/compare_{col}_{roi}.pdf")
+    fig.savefig(f"./plot/{folder_save}/compare_{col}_{roi}.jpg")
+
+    
###################################################################################################### + #### No rescaling #################################################################################### + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig])) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:])) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig])), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%5 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%6 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:])), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/No_dense/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/No_dense/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### No rescaling hist density 
###################################################################### + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, density = True) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, density = True) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%5 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%6 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/Np_dense/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Np_dense/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### No rescaling hist density True 
################################################################# + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, density = True) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, density = True) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + density = True, + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%5 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%6 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + density = True, + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/Np_dense_True/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Np_dense_True/compare_{col}_{roi}.jpg") + + c += 1 + +X = df.drop("target", axis = 1) +X = X.drop("target_bg", axis = 1) +print(X) +X = X.drop(f"wei_{roi}", axis = 1) +X = 
X.drop(f"Z_mass_{roi}", axis = 1) +X = X.drop(f"Z_pt_gen_{roi}", axis = 1) +X = X.drop(f"Z_mass_gen_{roi}", axis = 1) +print(X) +print(X.info()) + +y = df["target"] +print(y) + + + + +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder + +categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + +from sklearn.preprocessing import StandardScaler +numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + +cat_cols = X.select_dtypes(exclude = "number").columns +num_cols = X.select_dtypes(include = "number").columns + +print(cat_cols) +print(num_cols) + +from sklearn.compose import ColumnTransformer + +full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + +import xgboost as xgb + +X_processed = full_processor.fit_transform(X) +y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state = 1121218) + +from sklearn.metrics import accuracy_score + +### Init classifier +xgb_cl = xgb.XGBClassifier(booster = 'gbtree', base_score = 0.5, learning_rate = 0.01, gamma = 1, reg_alpha = 0.2, reg_lambda = 0.2, n_estimators = 1000, max_depth = 3, subsample = 0.8) + +### Fit +xgb_cl.fit(X_train, y_train) + +print(xgb_cl) +### Predict +preds = xgb_cl.predict(X_test) + +print(accuracy_score(y_test, preds)) + +from xgboost import plot_importance +from xgboost import plot_tree + +importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_}) +importances = importances.sort_values(by = "Importance", ascending = False) +importances = importances.set_index('Feature') +print(importances) +importances.plot.bar() + +fig, ax = plt.subplots(figsize=(17,12)) +plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax) +plt.xlabel('Feature scores') +plt.ylabel("Feature names") +plt.title('Importance plot') +plt.legend(['']) +#plt.show() +plt.savefig(f"./plot/{folder_save}/importance.jpg") + +plt.figure(figsize=(17,12)) +plot_tree(xgb_cl, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"./plot/{folder_save}/boost_tree.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show() diff --git a/xgb_test_no_coffea_diff_bgs_DATA.py b/xgb_test_no_coffea_diff_bgs_DATA.py new file mode 100644 index 0000000..d0325c0 --- /dev/null +++ b/xgb_test_no_coffea_diff_bgs_DATA.py @@ -0,0 +1,793 @@ +from coffea.util import load +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt, mplhep as hep +import hist +import argparse, sys, os, arrow, glob, yaml +from matplotlib.offsetbox import AnchoredText +from pathlib import Path +import os +from BTVNanoCommissioning.utils.plot_utils import ( + plotratio, + +) +net_path = "/net/scratch_cms3a/vaulin/" +folder_save = 'eval_23_08_22' +if not os.path.exists(f"./plot/{folder_save}"): + os.mkdir(f"./plot/{folder_save}") +if not os.path.exists(f"./plot/{folder_save}/No_dense"): + os.mkdir(f"./plot/{folder_save}/No_dense") +if not os.path.exists(f"./plot/{folder_save}/Np_dense"): + 
os.mkdir(f"./plot/{folder_save}/Np_dense") +if not os.path.exists(f"./plot/{folder_save}/Np_dense_True"): + os.mkdir(f"./plot/{folder_save}/Np_dense_True") +if not os.path.exists(net_path + f"plot/{folder_save}"): + os.mkdir(net_path + f"plot/{folder_save}") +def autoranger(array): + val, axis = array, np.arange(0,len(array)+1) + for i in range(len(val)): + if val[i] != 0: + mins = i + break + for i in reversed(range(len(val))): + if val[i] != 0: + maxs = i + 1 + break + print(axis[mins], axis[maxs]) + return axis[mins], axis[maxs], np.max(val), np.min(val) +names_sig = ['wei', 'Higgs_mass', 'Higgs_pt', 'Z_pt', 'Z_mass', 'Z_pt_gen', 'Z_mass_gen', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +names_sig_data = ['wei', 'Higgs_mass', 'Higgs_pt', 'Z_pt', 'Z_mass', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +roiis = ['high_mumu', 'high_ee', 'low_mumu', 'low_ee'] +roi = 'low_mumu' +#roi = 'low_ee' + +###################################################################################### +##### Read np arrays of signal sample ################################################ +###################################################################################### +data_path = 'condor_signal_06_mid/' +paths_np = [str(x) for x in Path(data_path + "ZHToCC_vau_sig").glob("**/*.npy") if ("_full" in str(x))] +#print(paths_np) +print(len(paths_np)) +df_sig_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_sig_full_np) + +key_np = {} +for col in names_sig: + for rois in roiis: + key_np[f'{col}_{rois}'] = [] +for col in names_sig: + for rois in roiis: + for path in paths_np: + if f'{col}_{rois}' in path: + key_np[f'{col}_{rois}'].append(path) + +for key in key_np.keys(): + #print(len(key_np[key]) == len(set(key_np[key]))) + key_np[key] = [np.load(element) for element in key_np[key]] + #print(key) + +print(key_np) + +key_np_full = {} +max_length = 0 +for col in names_sig: + for rois in roiis: + key_np_full[f'{col}_{rois}'] = np.array([]) +print(key_np_full) +for key in key_np_full.keys(): + key_np_full[key] = np.concatenate(tuple(key_np[key]), axis = None) + print(len(key_np_full[key])) + if max_length < len(key_np_full[key]): + max_length = len(key_np_full[key]) + +for key in key_np_full.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_sig_full_np[key] = list(np.append(key_np_full[key], np.repeat(np.nan, max_length- (len(key_np_full[key]))))) +#df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_sig_full_np) +df_s_new_np = df_sig_full_np[[f'{col}_{roi}' for col in names_sig]] + +print(len(df_s_new_np[f"wei_{roi}"])) +our_aray_results = len(df_s_new_np[f"wei_{roi}"]) + + + +df_s_new_np = df_s_new_np.dropna() +print(df_s_new_np) +len_var = [] +for col in names_sig: + len_var.append(len(df_s_new_np[f'{col}_{roi}'])) + df_s_new_np['target'] = np.ones(np.max(len_var)) + df_s_new_np['target_bg'] = np.zeros(np.max(len_var)) +print(df_s_new_np) + + +df_s_new_np.to_csv(f'./plot/{folder_save}/numpy_data_signal_{roi}.csv', sep=',', encoding='utf-8', index=False) +#df_s_new_np = 
pd.read_csv(f'./plot/{folder_save}/numpy_data.csv', sep=',', encoding='utf-8') +###################################################################################### + + +###################################################################################### +##### Read np arrays of background sample ############################################ +###################################################################################### +data_path = 'condor_back_07_early/' +def bg_processor(bg, nr): + paths_np_back = [str(x) for x in Path(data_path + f"{bg}").glob("**/*.npy") if ("_full" in str(x))] + #paths_np_back = [str(x) for x in Path("./condor_back_04_mid/DYJetsToLL_nlo_vau_bg").glob("**/*.npy") if ("_full" in str(x))] + #print(paths_np_back) + print(len(paths_np_back)) + df_back_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) + print(df_back_full_np) + + key_np_back = {} + for col in names_sig: + for rois in roiis: + key_np_back[f'{col}_{rois}'] = [] + for col in names_sig: + for rois in roiis: + for path in paths_np_back: + if f'{col}_{rois}' in path: + key_np_back[f'{col}_{rois}'].append(path) + #print(key_np_back) + for key in key_np_back.keys(): + print(len(key_np_back[key]) == len(set(key_np_back[key]))) + key_np_back[key] = [np.load(element) for element in key_np_back[key]] + print(key) + + #print(key_np_back) + + max_length_back = 0 + key_np_full_back = {} + for col in names_sig: + for rois in roiis: + key_np_full_back[f'{col}_{rois}'] = np.array([]) + for key in key_np_full_back.keys(): + key_np_full_back[key] = np.concatenate(tuple(key_np_back[key]), axis = None) + print(len(key_np_full_back[key])) + if max_length_back < len(key_np_full_back[key]): + max_length_back = len(key_np_full_back[key]) + #print(key_np_full_back) + + for key in key_np_full_back.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_back_full_np[key] = list(np.append(key_np_full_back[key], np.repeat(np.nan, max_length_back- (len(key_np_full_back[key]))))) + #df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) + print(df_back_full_np) + df_b_full_np = df_back_full_np[[f'{col}_{roi}' for col in names_sig]] + df_b_new_np = df_b_full_np.dropna() + print(df_b_new_np) + + len_var = [] + for col in names_sig: + len_var.append(len(df_b_new_np[f'{col}_{roi}'])) + df_b_new_np['target'] = np.zeros(np.max(len_var)) + df_b_new_np['target_bg'] = np.array([nr]*np.max(len_var)) + print(df_b_new_np) + df_b_new_np.to_csv(f'./plot/{folder_save}/numpy_data_bg_{bg}_{roi}.csv', sep=',', encoding='utf-8', index=False) + return df_b_new_np, len(df_b_new_np['target']) + +df_b_new_np_dy, len_dy = bg_processor("DYJetsToLL_nlo_vau_bg", 1) +df_b_new_np_zz, len_zz = bg_processor("ZZTo2L2Q_vau_bg", 2) +df_b_new_np_wz, len_wz = bg_processor("WZTo2Q2L_vau_bg", 3) +df_b_new_np_tt, len_tt = bg_processor("TTTo2L2Nu_vau_bg", 4) +df_b_new_np_zhtobb, len_zhtobb = bg_processor("ZH_HToBB_ZLL_vau_bg_old", 5) +max_len_bg = 0 +for l in [len_dy, len_zz, len_wz, len_tt, len_zhtobb]: + if max_len_bg < l: + max_len_bg = l + +###################################################################################### +##### Read np arrays of data sample ################################################## +###################################################################################### +data_path = 'condor_back_08_early/' +datas = ["Run2017B_DoubleMu_vau", "Run2017C_DoubleMu_vau", 
"Run2017D_DoubleMu_vau", "Run2017E_DoubleMu_vau", "Run2017F_DoubleMu_vau", + "Run2017B_DoubleEG_vau", "Run2017C_DoubleEG_vau", "Run2017D_DoubleEG_vau", "Run2017E_DoubleEG_vau", "Run2017F_DoubleEG_vau"] #"Run2017C_DoubleMu_vau" +df_data = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig_data for rois in roiis]) +for data in datas: + paths_np_data = [str(x) for x in Path(data_path + data).glob("**/*.npy") if ("_full" in str(x))] + + print(len(paths_np_data)) + df_data_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig_data for rois in roiis]) + print(df_data_full_np) + + key_np_data = {} + for col in names_sig_data: + for rois in roiis: + key_np_data[f'{col}_{rois}'] = [] + for col in names_sig_data: + for rois in roiis: + for path in paths_np_data: + if f'{col}_{rois}' in path: + key_np_data[f'{col}_{rois}'].append(path) + #print(key_np_back) + for key in key_np_data.keys(): + print(len(key_np_data[key]) == len(set(key_np_data[key]))) + key_np_data[key] = [np.load(element) for element in key_np_data[key]] + print(key) + + #print(key_np_back) + + max_length_data = 0 + key_np_full_data = {} + for col in names_sig_data: + for rois in roiis: + key_np_full_data[f'{col}_{rois}'] = np.array([]) + for key in key_np_full_data.keys(): + key_np_full_data[key] = np.concatenate(tuple(key_np_data[key]), axis = None) + print(len(key_np_full_data[key])) + if max_length_data < len(key_np_full_data[key]): + max_length_data = len(key_np_full_data[key]) + #print(key_np_full_back) + + for key in key_np_full_data.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_data_full_np[key] = list(np.append(key_np_full_data[key], np.repeat(np.nan, max_length_data- (len(key_np_full_data[key]))))) + #df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) + print(df_data_full_np) + df_dat_full_np = df_data_full_np[[f'{col}_{roi}' for col in names_sig_data]] + df_dat_new_np = df_dat_full_np.dropna() + print(df_dat_new_np) + + len_var = [] + for col in names_sig_data: + len_var.append(len(df_dat_new_np[f'{col}_{roi}'])) + df_dat_new_np['target'] = np.full(np.max(len_var), 2, dtype = int) + print(df_dat_new_np) + df_data = pd.concat([df_data, df_dat_new_np], ignore_index = True) +df_data.to_csv(f'./plot/{folder_save}/numpy_data_DATA.csv', sep=',', encoding='utf-8', index=False) +###################################################################################### +###################################################################################### +#folder_save = 'eval_23_07_25_2' +df = pd.concat([df_s_new_np, df_b_new_np_dy], ignore_index = True) +df = pd.concat([df, df_b_new_np_zz], ignore_index = True) +df = pd.concat([df, df_b_new_np_wz], ignore_index = True) +df = pd.concat([df, df_b_new_np_tt], ignore_index = True) +df = pd.concat([df, df_b_new_np_zhtobb], ignore_index = True) +print(df) +print(df.info()) +df.to_csv(net_path + f'/xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8', index=False) +df.to_csv(f'./plot/{folder_save}/xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8', index=False) + +xsec_weights = [0.002342, 6077., 3.74, 6.419, 88.51, 0.00720] + +#xsec_weights = [1 , 1, 1, 1, 1, 1] + +#df = pd.read_csv(f'xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8') + +print("% of negative weights: " + str(len(df[f"wei_{roi}"][df[f"wei_{roi}"]<0])/len(df[f"wei_{roi}"]))) + +time = arrow.now().format("YY_MM_DD") 
+
+plt.style.use(hep.style.ROOT)
+names_sig_updated = ['m(H)', '$p_t$(H)', '$p_t$(Z)', 'm(Z)', '$p_t$($Z_{gen}$)', 'm($Z_{gen}$)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$',
+          '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet',
+          '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$',
+          '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$']
+
+names_sig_updated_data = ['m(H)', '$p_t$(H)', '$p_t$(Z)', 'm(Z)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$',
+          '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet',
+          '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$',
+          '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$']
+
+
+c = 0
+for col in names_sig_data[1:]:
+
+    plt.figure(figsize=(10,10))
+    len_sig = int((df['target'] == 1).sum())  # number of signal rows, counted vectorised
+    print(len_sig)
+    names_big_ax = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'pt_lead', 'pt_sublead']
+    if col in names_big_ax:
+        hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_{roi}'][:len_sig])).plot()
+        hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_{roi}'][len_sig:])).plot()
+    else:
+        hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_{roi}'][:len_sig])).plot()
+        hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_{roi}'][len_sig:])).plot()
+    if 'pt' in col:
+        if 'ratio' not in col:
+            plt.xlabel('$p_t$ in GeV')
+        else:
+            plt.xlabel('')
+    elif 'mass' in col:
+        plt.xlabel('Mass in GeV')
+    else:
+        plt.xlabel('')
+    plt.ylabel("Counts")
+    # the loop runs over names_sig_data, so the matching label list is names_sig_updated_data
+    plt.title(f'{names_sig_updated_data[c]}_{roi}')
+    plt.legend(['Signal', 'Background'])
+    #plt.show()
+    plt.savefig(f"./plot/{folder_save}/{col}_{roi}.jpg")
+
+
+
+    fig, ((ax), (rax)) = plt.subplots(
+        2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True
+    )
+    fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97)
+    hep.cms.label("Private work", com="13", data=True, loc=0, ax=ax)
+    data_counts, data_bins = np.histogram(np.array(df_data[f'{col}_{roi}']),bins = 50, weights = np.array(df_data[f'wei_{roi}']))
+    counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True)
+    counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins = 80, weights = np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]), density = True)
+
+    counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80)
+    counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins = 80)
+    ## plot reference
+    n_bins = 80
+    xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig]))
+    bins = np.array([minval + i*((maxval-minval)/50) for i in range(0, 51)])
+
+    hep.histplot(
+        #np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 25, weights = 100000*xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])),
+        #label= 'ZH -> cc signal $\cdot 10^5$',
+        np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 50, weights = xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])),
+        label= 'ZH -> cc signal',
+        histtype="step",
+        color='r',
+        #yerr= 
np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 25, weights = 100000*xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig]))[0]), + yerr = True, + ax=ax, + density = False, + ) + #for i in range(0, len(bins2)-1): + # x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + # y_pos_sig = counts1[i] + (counts1[i] * 0.01) + # label_p_sig = str(counts11[i]) + # x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + # y_pos = counts2[i] + (counts2[i] * 0.01) + # label_p = str(counts22[i]) + # if i%5 == 0: + # ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + # if i%6 == 0: + # ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + n_bins = 80 + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + bins = np.array([minval + i*((maxval-minval)/50) for i in range(0, 51)]) + print(bins) + hep.histplot( + [np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]))], + stack = True, + label=['DY bg', 'ZZ bg', 'WZ bg', 'tt bg', 'ZH -> bb bg'], + histtype="fill", + color=['g', 'y', 'b', 'm', 'c'], + #bins = np.arange(80), + yerr = True, + #yerr= [np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]))[0]), + # np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]))[0])], + ax=ax, + density = False, + alpha = [0.3, 0.3, 0.3, 0.3, 0.3], + edgecolor = ["k", "k", "k", "k", "k"], + + ) + + ## plot compare list + ax.errorbar( + (data_bins[:-1] + data_bins[1:])/2, + np.array(data_counts), + label='Data', + marker = 'o', + color='k', + yerr=np.sqrt(np.array(data_counts)), #*(1/np.sum(data_counts)) + 
linestyle = "None",
+    )
+
+    # plot ratio of the signal density to each background density
+    nbinning = 50
+    counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True)
+    counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]), density = True)
+    counts3, bins3 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), density = True)
+    counts4, bins4 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), density = True)
+    counts5, bins5 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), density = True)
+    counts6, bins6 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), density = True)
+
+    # np.divide with `where` needs an explicit `out` array; without it the
+    # entries of empty background bins are left uninitialised
+    ratio_dy = np.divide(counts1, counts2, out = np.zeros_like(counts1), where = (counts2 != 0))
+    ratio_zz = np.divide(counts1, counts3, out = np.zeros_like(counts1), where = (counts3 != 0))
+    ratio_wz = np.divide(counts1, counts4, out = np.zeros_like(counts1), where = (counts4 != 0))
+    ratio_tt = np.divide(counts1, counts5, out = np.zeros_like(counts1), where = (counts5 != 0))
+    ratio_zhtobb = np.divide(counts1, counts6, out = np.zeros_like(counts1), where = (counts6 != 0))
+    rax.plot(bins1[:-1], ratio_dy, 'go')
+    rax.plot(bins1[:-1], ratio_zz, 'yo')
+    rax.plot(bins1[:-1], ratio_wz, 'bo')
+    rax.plot(bins1[:-1], ratio_tt, 'mo')
+    rax.plot(bins1[:-1], ratio_zhtobb, 'co')
+    rax.plot(bins1[:-1], [1]*len(ratio_dy), '--', color = 'black')
+
+
+    ## plot settings, adjust range
+    rax.set_xlabel(f'{names_sig_updated_data[c]} {roi}')
+    ax.set_xlabel(None)
+    ax.set_ylabel("Events")
+    rax.set_ylabel('$\\frac{Signal}{Background}$')
+    ax.ticklabel_format(style="sci", scilimits=(-3, 3))
+    ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05))
+    ax.legend()
+    rax.set_ylim(0.0, 4.0)
+    xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig]))
+    rax.set_xlim(minval, maxval)
+    at = AnchoredText(
+        "",
+        loc=2,
+        frameon=False,
+    )
+    ax.add_artist(at)
+    #hep.mpl_magic(ax=ax)
+    ax.set_ylim(bottom=0)
+
+    logext = ""
+    '''
+    # log y axis
+    if "log" in config.keys() and config["log"]:
+        ax.set_yscale("log")
+        logext = "_log"
+        ax.set_ylim(bottom=0.1)
+        hep.mpl_magic(ax=ax)
+    if "norm" in config.keys() and config["norm"]:
+        logext = "_norm" + logext
+    '''
+    fig.savefig(f"./plot/{folder_save}/compare_{col}_{roi}.pdf")
+    fig.savefig(f"./plot/{folder_save}/compare_{col}_{roi}.jpg")
+
+    ######################################################################################################
+    #### No rescaling ####################################################################################
+    ######################################################################################################
+    fig, ((ax), (rax)) = plt.subplots(
+        2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True
+    )
+    fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97)
+    hep.cms.label("Private work", com="13", data=True, loc=0, ax=ax)
+    counts1, bins1 = 
np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig])) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:])) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig])), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%5 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%6 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:])), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/No_dense/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/No_dense/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### No rescaling hist density ###################################################################### + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = 
np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, density = True) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, density = True) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%5 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%6 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/Np_dense/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Np_dense/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### No rescaling hist density True ################################################################# + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = 
np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, density = True) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, density = True) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + density = True, + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%5 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%6 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + density = True, + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/Np_dense_True/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Np_dense_True/compare_{col}_{roi}.jpg") + + c += 1 + +X = df.drop("target", axis = 1) +X = X.drop("target_bg", axis = 1) +print(X) +X = X.drop(f"wei_{roi}", axis = 1) +X = X.drop(f"Z_mass_{roi}", axis = 1) +X = X.drop(f"Z_pt_gen_{roi}", axis = 1) +X = X.drop(f"Z_mass_gen_{roi}", axis = 1) +print(X) +print(X.info()) + +y = df["target"] +print(y) + + + + +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder + +categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + +from sklearn.preprocessing import StandardScaler +numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = 
"mean")), ("scale", StandardScaler())]) + +cat_cols = X.select_dtypes(exclude = "number").columns +num_cols = X.select_dtypes(include = "number").columns + +print(cat_cols) +print(num_cols) + +from sklearn.compose import ColumnTransformer + +full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + +import xgboost as xgb + +X_processed = full_processor.fit_transform(X) +y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state = 1121218) + +from sklearn.metrics import accuracy_score + +### Init classifier +xgb_cl = xgb.XGBClassifier(booster = 'gbtree', base_score = 0.5, learning_rate = 0.01, gamma = 1, reg_alpha = 0.2, reg_lambda = 0.2, n_estimators = 1000, max_depth = 3, subsample = 0.8) + +### Fit +xgb_cl.fit(X_train, y_train) + +print(xgb_cl) +### Predict +preds = xgb_cl.predict(X_test) + +print(accuracy_score(y_test, preds)) + +from xgboost import plot_importance +from xgboost import plot_tree + +importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_}) +importances = importances.sort_values(by = "Importance", ascending = False) +importances = importances.set_index('Feature') +print(importances) +importances.plot.bar() + +fig, ax = plt.subplots(figsize=(17,12)) +plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax) +plt.xlabel('Feature scores') +plt.ylabel("Feature names") +plt.title('Importance plot') +plt.legend(['']) +#plt.show() +plt.savefig(f"./plot/{folder_save}/importance.jpg") + +plt.figure(figsize=(17,12)) +plot_tree(xgb_cl, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"./plot/{folder_save}/boost_tree.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show() diff --git a/xgb_test_no_coffea_diff_bgs_DATA_scale.py b/xgb_test_no_coffea_diff_bgs_DATA_scale.py new file mode 100644 index 0000000..847ec1d --- /dev/null +++ b/xgb_test_no_coffea_diff_bgs_DATA_scale.py @@ -0,0 +1,808 @@ +from coffea.util import load +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt, mplhep as hep +import hist +import argparse, sys, os, arrow, glob, yaml +from matplotlib.offsetbox import AnchoredText +from pathlib import Path +import os +from BTVNanoCommissioning.utils.plot_utils import ( + plotratio, + +) +net_path = "/net/scratch_cms3a/vaulin/" +folder_save = 'eval_23_09_14' +if not os.path.exists(f"./plot/{folder_save}"): + os.mkdir(f"./plot/{folder_save}") +if not os.path.exists(f"./plot/{folder_save}/No_dense"): + os.mkdir(f"./plot/{folder_save}/No_dense") +if not os.path.exists(f"./plot/{folder_save}/Np_dense"): + os.mkdir(f"./plot/{folder_save}/Np_dense") +if not os.path.exists(f"./plot/{folder_save}/Np_dense_True"): + os.mkdir(f"./plot/{folder_save}/Np_dense_True") +if not os.path.exists(net_path + f"plot/{folder_save}"): + os.mkdir(net_path + f"plot/{folder_save}") +def autoranger(array): + val, axis = array, np.arange(0,len(array)+1) + for i in range(len(val)): + if val[i] != 0: + mins = i + break + for i in reversed(range(len(val))): + if val[i] != 0: + maxs = i + 1 + break + print(axis[mins], axis[maxs]) + return axis[mins], axis[maxs], np.max(val), np.min(val) +names_sig = ['wei', 'Higgs_mass', 'Higgs_pt', 'Z_pt', 'Z_mass', 
'Z_pt_gen', 'Z_mass_gen', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +names_sig_data = ['wei', 'Higgs_mass', 'Higgs_pt', 'Z_pt', 'Z_mass', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +roiis = ['high_mumu', 'high_ee', 'low_mumu', 'low_ee'] +roi = 'low_mumu' +#roi = 'low_ee' + +###################################################################################### +##### Read np arrays of signal sample ################################################ +###################################################################################### +data_path = 'condor_signal_09_late/' +paths_np = [str(x) for x in Path(data_path + "ZHToCC_vau_sig").glob("**/*.npy") if ("_full" in str(x))] +#print(paths_np) +print(len(paths_np)) +df_sig_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_sig_full_np) + +key_np = {} +for col in names_sig: + for rois in roiis: + key_np[f'{col}_{rois}'] = [] +for col in names_sig: + for rois in roiis: + for path in paths_np: + if f'{col}_{rois}' in path: + key_np[f'{col}_{rois}'].append(path) + +for key in key_np.keys(): + #print(len(key_np[key]) == len(set(key_np[key]))) + key_np[key] = [np.load(element) for element in key_np[key]] + #print(key) + +print(key_np) + +key_np_full = {} +max_length = 0 +for col in names_sig: + for rois in roiis: + key_np_full[f'{col}_{rois}'] = np.array([]) +print(key_np_full) +for key in key_np_full.keys(): + key_np_full[key] = np.concatenate(tuple(key_np[key]), axis = None) + print(len(key_np_full[key])) + if max_length < len(key_np_full[key]): + max_length = len(key_np_full[key]) + +for key in key_np_full.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_sig_full_np[key] = list(np.append(key_np_full[key], np.repeat(np.nan, max_length- (len(key_np_full[key]))))) +#df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_sig_full_np) + + +for r in roiis: + df_s_new_np = df_sig_full_np[[f'{col}_{r}' for col in names_sig]] + + + df_s_new_np = df_s_new_np.dropna() + print(df_s_new_np) + len_var = [] + for col in names_sig: + len_var.append(len(df_s_new_np[f'{col}_{r}'])) + df_s_new_np['target'] = np.ones(np.max(len_var)) + df_s_new_np['target_bg'] = np.zeros(np.max(len_var)) + df_s_new_np.to_csv(f'./plot/{folder_save}/numpy_data_signal_{r}.csv', sep=',', encoding='utf-8', index=False) +print(df_s_new_np) +df_s_new_np = pd.read_csv(f'./plot/{folder_save}/numpy_data_signal_{roi}.csv', sep=',', encoding='utf-8') +print(len(df_s_new_np[f"wei_{roi}"])) +our_aray_results = len(df_s_new_np[f"wei_{roi}"]) +'''len_var = [] +for col in names_sig: + len_var.append(len(df_s_new_np[f'{col}_{roi}'])) + df_s_new_np['target'] = np.ones(np.max(len_var)) + df_s_new_np['target_bg'] = np.zeros(np.max(len_var)) +print(df_s_new_np) + + +df_s_new_np.to_csv(f'./plot/{folder_save}/numpy_data_signal_{roi}.csv', sep=',', encoding='utf-8', index=False) +'''#df_s_new_np = pd.read_csv(f'./plot/{folder_save}/numpy_data.csv', sep=',', encoding='utf-8') +###################################################################################### + + 
+###################################################################################### +##### Read np arrays of background sample ############################################ +###################################################################################### +data_path = 'condor_back_09_late/' +def bg_processor(bg, nr): + paths_np_back = [str(x) for x in Path(data_path + f"{bg}").glob("**/*.npy") if ("_full" in str(x))] + #paths_np_back = [str(x) for x in Path("./condor_back_04_mid/DYJetsToLL_nlo_vau_bg").glob("**/*.npy") if ("_full" in str(x))] + #print(paths_np_back) + print(len(paths_np_back)) + df_back_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) + print(df_back_full_np) + + key_np_back = {} + for col in names_sig: + for rois in roiis: + key_np_back[f'{col}_{rois}'] = [] + for col in names_sig: + for rois in roiis: + for path in paths_np_back: + if f'{col}_{rois}' in path: + key_np_back[f'{col}_{rois}'].append(path) + #print(key_np_back) + for key in key_np_back.keys(): + print(len(key_np_back[key]) == len(set(key_np_back[key]))) + key_np_back[key] = [np.load(element, allow_pickle = True) for element in key_np_back[key]] + print(key) + + #print(key_np_back) + + max_length_back = 0 + key_np_full_back = {} + for col in names_sig: + for rois in roiis: + key_np_full_back[f'{col}_{rois}'] = np.array([]) + for key in key_np_full_back.keys(): + key_np_full_back[key] = np.concatenate(tuple(key_np_back[key]), axis = None) + print(len(key_np_full_back[key])) + if max_length_back < len(key_np_full_back[key]): + max_length_back = len(key_np_full_back[key]) + #print(key_np_full_back) + + for key in key_np_full_back.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_back_full_np[key] = list(np.append(key_np_full_back[key], np.repeat(np.nan, max_length_back- (len(key_np_full_back[key]))))) + #df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) + print(df_back_full_np) + + + for r in roiis: + df_b_full_np = df_back_full_np[[f'{col}_{r}' for col in names_sig]] + df_b_new_np = df_b_full_np.dropna() + print(df_b_new_np) + len_var = [] + for col in names_sig: + len_var.append(len(df_b_new_np[f'{col}_{r}'])) + df_b_new_np['target'] = np.zeros(np.max(len_var)) + df_b_new_np['target_bg'] = np.array([nr]*np.max(len_var)) + print(df_b_new_np) + df_b_new_np.to_csv(f'./plot/{folder_save}/numpy_data_bg_{bg}_{r}.csv', sep=',', encoding='utf-8', index=False) + df_b_new_np = pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_{bg}_{roi}.csv', sep=',', encoding='utf-8') + return df_b_new_np, len(df_b_new_np['target']) + +df_b_new_np_dy, len_dy = bg_processor("DYJetsToLL_nlo_vau_bg", 1) +df_b_new_np_zz, len_zz = bg_processor("ZZTo2L2Q_vau_bg", 2) +df_b_new_np_wz, len_wz = bg_processor("WZTo2Q2L_vau_bg", 3) +df_b_new_np_tt, len_tt = bg_processor("TTTo2L2Nu_vau_bg", 4) +df_b_new_np_zhtobb, len_zhtobb = bg_processor("ZH_HToBB_ZLL_vau_bg", 5) +max_len_bg = 0 +for l in [len_dy, len_zz, len_wz, len_tt, len_zhtobb]: + if max_len_bg < l: + max_len_bg = l + +###################################################################################### +##### Read np arrays of data sample ################################################## +###################################################################################### +data_path = 'condor_back_09_late/' +datas = ["Run2017B_DoubleMu_vau", "Run2017C_DoubleMu_vau", "Run2017D_DoubleMu_vau", 
"Run2017E_DoubleMu_vau", "Run2017F_DoubleMu_vau", + "Run2017B_DoubleEG_vau", "Run2017C_DoubleEG_vau", "Run2017D_DoubleEG_vau", "Run2017E_DoubleEG_vau", "Run2017F_DoubleEG_vau"] #"Run2017C_DoubleMu_vau" +df_data = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig_data for rois in roiis]) +for data in datas: + paths_np_data = [str(x) for x in Path(data_path + data).glob("**/*.npy") if ("_full" in str(x))] + + print(len(paths_np_data)) + df_data_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig_data for rois in roiis]) + print(df_data_full_np) + + key_np_data = {} + for col in names_sig_data: + for rois in roiis: + key_np_data[f'{col}_{rois}'] = [] + for col in names_sig_data: + for rois in roiis: + for path in paths_np_data: + if f'{col}_{rois}' in path: + key_np_data[f'{col}_{rois}'].append(path) + #print(key_np_back) + for key in key_np_data.keys(): + print(len(key_np_data[key]) == len(set(key_np_data[key]))) + key_np_data[key] = [np.load(element) for element in key_np_data[key]] + print(key) + + #print(key_np_back) + + max_length_data = 0 + key_np_full_data = {} + for col in names_sig_data: + for rois in roiis: + key_np_full_data[f'{col}_{rois}'] = np.array([]) + for key in key_np_full_data.keys(): + key_np_full_data[key] = np.concatenate(tuple(key_np_data[key]), axis = None) + print(len(key_np_full_data[key])) + if max_length_data < len(key_np_full_data[key]): + max_length_data = len(key_np_full_data[key]) + #print(key_np_full_back) + + for key in key_np_full_data.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_data_full_np[key] = list(np.append(key_np_full_data[key], np.repeat(np.nan, max_length_data- (len(key_np_full_data[key]))))) + #df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) + print(df_data_full_np) + + for r in roiis: + df_dat_full_np = df_data_full_np[[f'{col}_{r}' for col in names_sig_data]] + df_dat_new_np = df_dat_full_np.dropna() + print(df_dat_new_np) + len_var = [] + for col in names_sig_data: + len_var.append(len(df_dat_new_np[f'{col}_{r}'])) + df_dat_new_np['target'] = np.full(np.max(len_var), 2, dtype = int) + print(df_dat_new_np) + df_data = pd.concat([df_data, df_dat_new_np], ignore_index = True) + df_data.to_csv(f'./plot/{folder_save}/numpy_data_DATA_{r}.csv', sep=',', encoding='utf-8', index=False) +df_data = pd.read_csv(f'./plot/{folder_save}/numpy_data_DATA_{roi}.csv', sep=',', encoding='utf-8') +###################################################################################### +###################################################################################### +#folder_save = 'eval_23_07_25_2' +df = pd.concat([df_s_new_np, df_b_new_np_dy], ignore_index = True) +df = pd.concat([df, df_b_new_np_zz], ignore_index = True) +df = pd.concat([df, df_b_new_np_wz], ignore_index = True) +df = pd.concat([df, df_b_new_np_tt], ignore_index = True) +df = pd.concat([df, df_b_new_np_zhtobb], ignore_index = True) +print(df) +print(df.info()) +df.to_csv(net_path + f'/xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8', index=False) +df.to_csv(f'./plot/{folder_save}/xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8', index=False) + +xsec_weights = [0.002342*(41480/3323082), 6077.*(41480/102863931), 3.74*(41480/19134840), + 6.419*(41480/18136498), 88.51*(41480/105859990), 0.00720*(41480/4337504)] + +#xsec_weights = [1 , 1, 1, 1, 1, 1] + +#df = pd.read_csv(f'xgb_training_dataset_{roi}.csv', 
sep=',', encoding='utf-8')
+
+print("% of negative weights: " + str(len(df[f"wei_{roi}"][df[f"wei_{roi}"]<0])/len(df[f"wei_{roi}"])))
+
+time = arrow.now().format("YY_MM_DD")
+plt.style.use(hep.style.ROOT)
+names_sig_updated = ['m(H)', '$p_t$(H)', '$p_t$(Z)', 'm(Z)', '$p_t$($Z_{gen}$)', 'm($Z_{gen}$)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$',
+                    '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet',
+                    '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$',
+                    '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$']
+
+names_sig_updated_data = ['m(H)', '$p_t$(H)', '$p_t$(Z)', 'm(Z)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$',
+                    '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet',
+                    '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$',
+                    '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$']
+
+
+c = 0
+for col in names_sig_data[1:]:
+
+    plt.figure(figsize=(10,10))
+    len_sig = 0
+    for i in range(0,len(df['target'])):
+        if df['target'][i] == 1:
+            len_sig += 1
+    print(len_sig)
+    names_big_ax = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'pt_lead', 'pt_sublead']
+    if col in names_big_ax:
+        hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_{roi}'][:len_sig])).plot()
+        hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_{roi}'][len_sig:])).plot()
+    else:
+        hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_{roi}'][:len_sig])).plot()
+        hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_{roi}'][len_sig:])).plot()
+    if 'pt' in col:
+        if 'ratio' not in col:
+            plt.xlabel('$p_t$ in GeV')
+        else:
+            plt.xlabel('')
+    elif 'mass' in col:
+        plt.xlabel('Mass in GeV')
+    else:
+        plt.xlabel('')
+    plt.ylabel("Counts")
+    plt.title(f'{names_sig_updated_data[c]}_{roi}')
+    plt.legend(['Signal', 'Background'])
+    #plt.show()
+    plt.savefig(f"./plot/{folder_save}/{col}_{roi}.jpg")
+
+
+
+    fig, ((ax), (rax)) = plt.subplots(
+        2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True
+    )
+    fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97)
+    hep.cms.label("Private work", com="13", data=True, loc=0, ax=ax)
+    data_counts, data_bins = np.histogram(np.array(df_data[f'{col}_{roi}']),bins =50, weights = np.array(df_data[f'wei_{roi}']))
+    counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True)
+    counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]), density = True)
+
+    counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80)
+    counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins =80)
+    ## plot reference
+    n_bins = 80
+    xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig]))
+    bins = np.array([minval + i*((maxval-minval)/50) for i in range(0, 51)])
+
+    hep.histplot(
+        #np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 25, weights = 100000*xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])),
+        #label= 'ZH -> cc signal $\cdot 10^5$',
+        
np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 50, weights = xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])), + label= 'ZH -> cc signal', + histtype="step", + color='r', + #yerr= np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 25, weights = 100000*xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig]))[0]), + yerr = True, + ax=ax, + density = False, + ) + #for i in range(0, len(bins2)-1): + # x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + # y_pos_sig = counts1[i] + (counts1[i] * 0.01) + # label_p_sig = str(counts11[i]) + # x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + # y_pos = counts2[i] + (counts2[i] * 0.01) + # label_p = str(counts22[i]) + # if i%5 == 0: + # ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + # if i%6 == 0: + # ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + n_bins = 80 + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + bins = np.array([minval + i*((maxval-minval)/50) for i in range(0, 51)]) + print(bins) + hep.histplot( + [np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]))], + stack = True, + label=['DY bg', 'ZZ bg', 'WZ bg', 'tt bg', 'ZH -> bb bg'], + histtype="fill", + color=['g', 'y', 'b', 'm', 'c'], + #bins = np.arange(80), + yerr = True, + #yerr= [np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]))[0]), + # np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]))[0])], + ax=ax, + density = False, + alpha = [0.3, 0.3, 0.3, 0.3, 0.3], + edgecolor = ["k", "k", "k", "k", "k"], + + ) + + ## plot compare 
list + ax.errorbar( + (data_bins[:-1] + data_bins[1:])/2, + np.array(data_counts), + label='Data', + marker = 'o', + color='k', + yerr=np.sqrt(np.array(data_counts)), #*(1/np.sum(data_counts)) + linestyle = "None", + ) + + # plot ratio of com/Ref + nbinning = 50 + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]), density = True) + counts3, bins3 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), density = True) + counts4, bins4 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), density = True) + counts5, bins5 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), density = True) + counts6, bins6 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), density = True) + + ratio_dy = np.divide(counts1, counts2, where = (counts2 != 0)) + ratio_zz = np.divide(counts1, counts3, where = (counts3 != 0)) + ratio_wz = np.divide(counts1, counts4, where = (counts4 != 0)) + ratio_tt = np.divide(counts1, counts5, where = (counts5 != 0)) + ratio_zhtobb = np.divide(counts1, counts6, where = (counts6 != 0)) + rax.plot(bins1[:-1], ratio_dy, 'go') + rax.plot(bins1[:-1], ratio_zz, 'yo') + rax.plot(bins1[:-1], ratio_wz, 'bo') + rax.plot(bins1[:-1], ratio_tt, 'mo') + rax.plot(bins1[:-1], ratio_zhtobb, 'co') + rax.plot(bins1[:-1], [1]*len(ratio_dy), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated_data[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 4.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + #hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### No rescaling #################################################################################### + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + 
fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig])) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:])) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig])), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%5 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%6 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:])), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/No_dense/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/No_dense/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### No rescaling hist density ###################################################################### + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, 
bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, density = True) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, density = True) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%5 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%6 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/Np_dense/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Np_dense/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### No rescaling hist density True ################################################################# + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = 
np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, density = True) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, density = True) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + density = True, + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%5 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%6 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + density = True, + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/Np_dense_True/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Np_dense_True/compare_{col}_{roi}.jpg") + + c += 1 + +X = df.drop("target", axis = 1) +X = X.drop("target_bg", axis = 1) +print(X) +X = X.drop(f"wei_{roi}", axis = 1) +X = X.drop(f"Z_mass_{roi}", axis = 1) +X = X.drop(f"Z_pt_gen_{roi}", axis = 1) +X = X.drop(f"Z_mass_gen_{roi}", axis = 1) +print(X) +print(X.info()) + +y = df["target"] +print(y) + + + + +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder + +categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", 
sparse = False)),])
+
+from sklearn.preprocessing import StandardScaler
+numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())])
+
+cat_cols = X.select_dtypes(exclude = "number").columns
+num_cols = X.select_dtypes(include = "number").columns
+
+print(cat_cols)
+print(num_cols)
+
+from sklearn.compose import ColumnTransformer
+
+full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),])
+
+import xgboost as xgb
+
+X_processed = full_processor.fit_transform(X)
+y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1))
+
+from sklearn.model_selection import train_test_split
+
+X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state = 1121218)
+
+from sklearn.metrics import accuracy_score
+
+### Init classifier
+xgb_cl = xgb.XGBClassifier(booster = 'gbtree', base_score = 0.5, learning_rate = 0.01, gamma = 1, reg_alpha = 0.2, reg_lambda = 0.2, n_estimators = 1000, max_depth = 3, subsample = 0.8)
+
+### Fit
+xgb_cl.fit(X_train, y_train)
+
+print(xgb_cl)
+### Predict
+preds = xgb_cl.predict(X_test)
+
+print(accuracy_score(y_test, preds))
+
+from xgboost import plot_importance
+from xgboost import plot_tree
+
+importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_})
+importances = importances.sort_values(by = "Importance", ascending = False)
+importances = importances.set_index('Feature')
+print(importances)
+importances.plot.bar()
+
+fig, ax = plt.subplots(figsize=(17,12))
+plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax)
+plt.xlabel('Feature scores')
+plt.ylabel("Feature names")
+plt.title('Importance plot')
+plt.legend([''])
+#plt.show()
+plt.savefig(f"./plot/{folder_save}/importance.jpg")
+
+plt.figure(figsize=(17,12))
+plot_tree(xgb_cl, fmap = 'feature_map.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(f"./plot/{folder_save}/boost_tree.jpg", dpi = 1800)
+### result = 1/(1+np.exp(-leaf_value)) for belonging to class 1
+#plt.show()
diff --git a/xgb_test_no_coffea_diff_bgs_DATA_scale_pandas.py b/xgb_test_no_coffea_diff_bgs_DATA_scale_pandas.py
new file mode 100644
index 0000000..7218d1d
--- /dev/null
+++ b/xgb_test_no_coffea_diff_bgs_DATA_scale_pandas.py
@@ -0,0 +1,1239 @@
+from coffea.util import load
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt, mplhep as hep
+import hist
+import argparse, sys, os, arrow, glob, yaml
+from matplotlib.offsetbox import AnchoredText
+from pathlib import Path
+import os
+from BTVNanoCommissioning.utils.plot_utils import (
+    plotratio,
+
+)
+net_path = "/net/scratch_cms3a/vaulin/"
+folder_save = 'eval_23_10_16'
+if not os.path.exists(f"./plot/{folder_save}"):
+    os.mkdir(f"./plot/{folder_save}")
+if not os.path.exists(f"./plot/{folder_save}/No_dense"):
+    os.mkdir(f"./plot/{folder_save}/No_dense")
+if not os.path.exists(f"./plot/{folder_save}/Np_dense"):
+    os.mkdir(f"./plot/{folder_save}/Np_dense")
+if not os.path.exists(f"./plot/{folder_save}/Np_dense_True"):
+    os.mkdir(f"./plot/{folder_save}/Np_dense_True")
+if not os.path.exists(net_path + f"plot/{folder_save}"):
+    os.mkdir(net_path + f"plot/{folder_save}")
+def autoranger(array):
+    val, axis = array, np.arange(0,len(array)+1)
+    for i in range(len(val)):
+        if val[i] != 0:
+            mins = i
+            break
+    for i in reversed(range(len(val))):
+        if val[i] != 0:
+            maxs = i
+ 1 + break + print(axis[mins], axis[maxs]) + return axis[mins], axis[maxs], np.max(val), np.min(val) +names_sig = ['wei', 'Higgs_mass', 'Higgs_pt', 'Z_pt', 'Z_mass', 'Z_pt_gen', 'Z_mass_gen', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +names_sig_data = ['wei', 'Higgs_mass', 'Higgs_pt', 'Z_pt', 'Z_mass', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +roiis = ['high_mumu', 'high_ee', 'low_mumu', 'low_ee'] +# ", high $p_t$ $Z \\rightarrow ee$" +# ", high $p_t$ $Z \\rightarrow \\mu\\mu$" +# ", low $p_t$ $Z \\rightarrow ee$" +# ", low $p_t$ $Z \\rightarrow \\mu\\mu$" +roi = 'low_mumu' +roi_latex = ", low $p_t$ $Z \\rightarrow \\mu\\mu$" +#roi = 'low_ee' + +###################################################################################### +##### Read np arrays of signal sample ################################################ +###################################################################################### +''' +data_path = 'condor_signal_06_mid/' +paths_np = [str(x) for x in Path(data_path + "ZHToCC_vau_sig").glob("**/*.npy") if ("_full" in str(x))] +#print(paths_np) +print(len(paths_np)) +df_sig_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_sig_full_np) + +key_np = {} +for col in names_sig: + for rois in roiis: + key_np[f'{col}_{rois}'] = [] +for col in names_sig: + for rois in roiis: + for path in paths_np: + if f'{col}_{rois}' in path: + key_np[f'{col}_{rois}'].append(path) + +for key in key_np.keys(): + #print(len(key_np[key]) == len(set(key_np[key]))) + key_np[key] = [np.load(element) for element in key_np[key]] + #print(key) + +print(key_np) + +key_np_full = {} +max_length = 0 +for col in names_sig: + for rois in roiis: + key_np_full[f'{col}_{rois}'] = np.array([]) +print(key_np_full) +for key in key_np_full.keys(): + key_np_full[key] = np.concatenate(tuple(key_np[key]), axis = None) + print(len(key_np_full[key])) + if max_length < len(key_np_full[key]): + max_length = len(key_np_full[key]) + +for key in key_np_full.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_sig_full_np[key] = list(np.append(key_np_full[key], np.repeat(np.nan, max_length- (len(key_np_full[key]))))) +#df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_sig_full_np) +df_s_new_np = df_sig_full_np[[f'{col}_{roi}' for col in names_sig]] + +print(len(df_s_new_np[f"wei_{roi}"])) +our_aray_results = len(df_s_new_np[f"wei_{roi}"]) + + + +df_s_new_np = df_s_new_np.dropna() +print(df_s_new_np) +len_var = [] +for col in names_sig: + len_var.append(len(df_s_new_np[f'{col}_{roi}'])) + df_s_new_np['target'] = np.ones(np.max(len_var)) + df_s_new_np['target_bg'] = np.zeros(np.max(len_var)) +print(df_s_new_np) + + +df_s_new_np.to_csv(f'./plot/{folder_save}/numpy_data_signal_{roi}.csv', sep=',', encoding='utf-8', index=False) +''' +df_s_new_np = pd.read_csv(f'./plot/{folder_save}/numpy_data_signal_{roi}.csv', sep=',', encoding='utf-8') +###################################################################################### + + 
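+# Sketch of the cache-or-rebuild pattern this file follows (build_signal_frame is
+# a hypothetical helper standing in for the commented-out block above; the cached
+# CSV must exist from an earlier run for the plain read_csv path to work):
+#
+#   cache = Path(f'./plot/{folder_save}/numpy_data_signal_{roi}.csv')
+#   if cache.exists():
+#       df_s_new_np = pd.read_csv(cache, sep=',', encoding='utf-8')
+#   else:
+#       df_s_new_np = build_signal_frame(data_path, names_sig, roiis, roi)
+#       df_s_new_np.to_csv(cache, sep=',', encoding='utf-8', index=False)
+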
+###################################################################################### +##### Read np arrays of background sample ############################################ +###################################################################################### +''' +data_path = 'condor_back_07_early/' +def bg_processor(bg, nr): + paths_np_back = [str(x) for x in Path(data_path + f"{bg}").glob("**/*.npy") if ("_full" in str(x))] + #paths_np_back = [str(x) for x in Path("./condor_back_04_mid/DYJetsToLL_nlo_vau_bg").glob("**/*.npy") if ("_full" in str(x))] + #print(paths_np_back) + print(len(paths_np_back)) + df_back_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) + print(df_back_full_np) + + key_np_back = {} + for col in names_sig: + for rois in roiis: + key_np_back[f'{col}_{rois}'] = [] + for col in names_sig: + for rois in roiis: + for path in paths_np_back: + if f'{col}_{rois}' in path: + key_np_back[f'{col}_{rois}'].append(path) + #print(key_np_back) + for key in key_np_back.keys(): + print(len(key_np_back[key]) == len(set(key_np_back[key]))) + key_np_back[key] = [np.load(element) for element in key_np_back[key]] + print(key) + + #print(key_np_back) + + max_length_back = 0 + key_np_full_back = {} + for col in names_sig: + for rois in roiis: + key_np_full_back[f'{col}_{rois}'] = np.array([]) + for key in key_np_full_back.keys(): + key_np_full_back[key] = np.concatenate(tuple(key_np_back[key]), axis = None) + print(len(key_np_full_back[key])) + if max_length_back < len(key_np_full_back[key]): + max_length_back = len(key_np_full_back[key]) + #print(key_np_full_back) + + for key in key_np_full_back.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_back_full_np[key] = list(np.append(key_np_full_back[key], np.repeat(np.nan, max_length_back- (len(key_np_full_back[key]))))) + #df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) + print(df_back_full_np) + df_b_full_np = df_back_full_np[[f'{col}_{roi}' for col in names_sig]] + df_b_new_np = df_b_full_np.dropna() + print(df_b_new_np) + + len_var = [] + for col in names_sig: + len_var.append(len(df_b_new_np[f'{col}_{roi}'])) + df_b_new_np['target'] = np.zeros(np.max(len_var)) + df_b_new_np['target_bg'] = np.array([nr]*np.max(len_var)) + print(df_b_new_np) + df_b_new_np.to_csv(f'./plot/{folder_save}/numpy_data_bg_{bg}_{roi}.csv', sep=',', encoding='utf-8', index=False) + return df_b_new_np, len(df_b_new_np['target']) + +''' +#df_b_new_np_dy, len_dy = bg_processor("DYJetsToLL_nlo_vau_bg", 1) +#df_b_new_np_zz, len_zz = bg_processor("ZZTo2L2Q_vau_bg", 2) +#df_b_new_np_wz, len_wz = bg_processor("WZTo2Q2L_vau_bg", 3) +#df_b_new_np_tt, len_tt = bg_processor("TTTo2L2Nu_vau_bg", 4) +#df_b_new_np_zhtobb, len_zhtobb = bg_processor("ZH_HToBB_ZLL_vau_bg_old", 5) + +df_b_new_np_dy, len_dy = pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_DYJetsToLL_nlo_vau_bg_{roi}.csv', sep=',', encoding='utf-8'), len(pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_DYJetsToLL_nlo_vau_bg_{roi}.csv', sep=',', encoding='utf-8')['target']) +df_b_new_np_zz, len_zz = pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_ZZTo2L2Q_vau_bg_{roi}.csv', sep=',', encoding='utf-8'), len(pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_ZZTo2L2Q_vau_bg_{roi}.csv', sep=',', encoding='utf-8')['target']) +df_b_new_np_wz, len_wz = pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_WZTo2Q2L_vau_bg_{roi}.csv', sep=',', 
encoding='utf-8'), len(pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_WZTo2Q2L_vau_bg_{roi}.csv', sep=',', encoding='utf-8')['target']) +df_b_new_np_tt, len_tt = pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_TTTo2L2Nu_vau_bg_{roi}.csv', sep=',', encoding='utf-8'), len(pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_TTTo2L2Nu_vau_bg_{roi}.csv', sep=',', encoding='utf-8')['target']) +df_b_new_np_zhtobb, len_zhtobb = pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_ZH_HToBB_ZLL_vau_bg_{roi}.csv', sep=',', encoding='utf-8'), len(pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_ZH_HToBB_ZLL_vau_bg_{roi}.csv', sep=',', encoding='utf-8')['target']) + +max_len_bg = 0 +for l in [len_dy, len_zz, len_wz, len_tt, len_zhtobb]: + if max_len_bg < l: + max_len_bg = l + +###################################################################################### +##### Read np arrays of data sample ################################################## +###################################################################################### + +data_path = 'condor_back_09_late/' +datas = ["Run2017F_DoubleMu_vau", "Run2017F_DoubleEG_vau", "Run2017D_DoubleEG_vau", "Run2017B_DoubleMu_vau", "Run2017C_DoubleMu_vau", "Run2017D_DoubleMu_vau", "Run2017E_DoubleMu_vau", + "Run2017B_DoubleEG_vau", "Run2017C_DoubleEG_vau", "Run2017E_DoubleEG_vau"] +''' +for data in datas: + + paths_np_data = [str(x) for x in Path(data_path + data).glob("**/*.npy") if ("_full" in str(x))] + + print(len(paths_np_data), data) + df_data_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig_data for rois in roiis]) + print(df_data_full_np) + + key_np_data = {} + for col in names_sig_data: + for rois in roiis: + key_np_data[f'{col}_{rois}'] = [] + for col in names_sig_data: + for rois in roiis: + for path in paths_np_data: + if f'{col}_{rois}' in path: + key_np_data[f'{col}_{rois}'].append(path) + #print(key_np_back) + for key in key_np_data.keys(): + print(len(key_np_data[key]) == len(set(key_np_data[key]))) + key_np_data[key] = [np.load(element, allow_pickle = True) for element in key_np_data[key]] + print(key) + + #print(key_np_back) + + max_length_data = 0 + key_np_full_data = {} + for col in names_sig_data: + for rois in roiis: + key_np_full_data[f'{col}_{rois}'] = np.array([]) + for key in key_np_full_data.keys(): + key_np_full_data[key] = np.concatenate(tuple(key_np_data[key]), axis = None) + print(len(key_np_full_data[key])) + if max_length_data < len(key_np_full_data[key]): + max_length_data = len(key_np_full_data[key]) + #print(key_np_full_back) + + for key in key_np_full_data.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_data_full_np[key] = list(np.append(key_np_full_data[key], np.repeat(np.nan, max_length_data- (len(key_np_full_data[key]))))) + #df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) + print(df_data_full_np) + for r in roiis: + df_data = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig_data for rois in roiis]) + df_dat_full_np = df_data_full_np[[f'{col}_{r}' for col in names_sig_data]] + df_dat_new_np = df_dat_full_np.dropna() + print(df_dat_new_np) + len_var = [] + for col in names_sig_data: + len_var.append(len(df_dat_new_np[f'{col}_{r}'])) + df_dat_new_np['target'] = np.full(np.max(len_var), 2, dtype = int) + print(df_dat_new_np) + df_data = pd.concat([df_data, df_dat_new_np], ignore_index = True) + 
df_data.to_csv(f'./plot/{folder_save}/numpy_data_DATA_{r}_{data}.csv', sep=',', encoding='utf-8', index=False) +''' +df_data_final = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig_data for rois in roiis]) +for data in datas: + df_data = pd.read_csv(f'./plot/{folder_save}/numpy_data_DATA_{roi}_{data}.csv', sep=',', encoding='utf-8') + df_data_final = pd.concat([df_data_final, df_data], ignore_index = True) +print(df_data_final) +df_data_final.to_csv(f'./plot/{folder_save}/numpy_data_DATA_final_{roi}.csv', sep=',', encoding='utf-8', index=False) +###################################################################################### +###################################################################################### +#folder_save = 'eval_23_07_25_2' +df = pd.concat([df_s_new_np, df_b_new_np_dy], ignore_index = True) +df = pd.concat([df, df_b_new_np_zz], ignore_index = True) +df = pd.concat([df, df_b_new_np_wz], ignore_index = True) +df = pd.concat([df, df_b_new_np_tt], ignore_index = True) +df = pd.concat([df, df_b_new_np_zhtobb], ignore_index = True) +print(df) +print(df.info()) +df.to_csv(net_path + f'/xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8', index=False) +df.to_csv(f'./plot/{folder_save}/xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8', index=False) + +strange_factor = 1.26 ###low_channel +#strange_factor = 1.22 ###high_channel +lumi = 41480*strange_factor +#lumi = 49810 +xsec_weights = [0.002342*(lumi/3323082), 6077.*(lumi/102863931), 3.74*(lumi/19134840), + 6.419*(lumi/18136498), 88.51*(lumi/105859990), 0.00720*(lumi/4337504)] +print(xsec_weights) +#xsec_weights = [1 , 1, 1, 1, 1, 1] + +#df = pd.read_csv(f'xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8') + +print("% of negative weights: " + str(len(df[f"wei_{roi}"][df[f"wei_{roi}"]<0])/len(df[f"wei_{roi}"]))) + +time = arrow.now().format("YY_MM_DD") +plt.style.use(hep.style.ROOT) +names_sig_updated = ['m(H)', '$p_t$(H)', '$p_t$(Z)', 'm(Z)', '$p_t$($Z_{gen}$)', 'm($Z_{gen}$)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$', + '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet', + '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$', + '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$'] + +names_sig_updated_data = ['m(H) [GeV]', '$p_t$(H) [GeV]', '$p_t$(Z) [GeV]', 'm(Z) [GeV]', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$', + '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet [GeV]', '$p_t$ of $CvsL_{min}$ jet [GeV]', + '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$', + '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$'] + +import scipy +c = 0 +for col in names_sig_data[1:]: + + plt.figure(figsize=(10,10)) + len_sig = 0 + for i in range(0,len(df['target'])): + if df['target'][i] == 1: + len_sig += 1 + print(len_sig) + names_big_ax = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'pt_lead', 'pt_sublead'] + if col in names_big_ax: + hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_{roi}'][:len_sig])).plot() + hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_{roi}'][len_sig:])).plot() + else: + hist.Hist.new.Regular(150, 0, 
5).Double().fill(np.array(df[f'{col}_{roi}'][:len_sig])).plot()
+        hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_{roi}'][len_sig:])).plot()
+    if 'pt' in col:
+        if 'ratio' not in col:
+            plt.xlabel('$p_t$ in GeV')
+        else:
+            plt.xlabel('')
+    elif 'mass' in col:
+        plt.xlabel('Mass in GeV')
+    else:
+        plt.xlabel('')
+    plt.ylabel("Counts")
+    plt.title(f'{names_sig_updated[c]}_{roi}')
+    plt.legend(['Signal', 'Background'])
+    #plt.show()
+    plt.savefig(f"./plot/{folder_save}/{col}_{roi}.jpg")
+
+
+
+    fig, ((ax), (rax)) = plt.subplots(
+        2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True
+    )
+    fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97)
+    hep.cms.label("Private work", com="13", data=True, loc=0, ax=ax)
+
+    counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]), bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True)
+    counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = 80, weights = np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]), density = True)
+    print(f'{col}_{roi}', len_sig, len_dy)
+    counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]), bins = 80)
+    counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = 80)
+    ## plot reference
+    n_bins = 80
+    xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig]))
+    bins = np.array([minval + i*((maxval-minval)/50) for i in range(0, 51)])
+    data_counts, data_bins = np.histogram(np.array(df_data_final[f'{col}_{roi}']), bins = bins, weights = np.array(df_data_final[f'wei_{roi}']))
+
+    hep.histplot(
+        #np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 25, weights = 100000*xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])),
+        #label= 'ZH -> cc signal $\cdot 10^5$',
+        np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]), bins = bins, weights = xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])),
+        label= 'ZH -> cc signal',
+        histtype="step",
+        color='r',
+        #yerr= np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 25, weights = 100000*xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig]))[0]),
+        yerr = True,
+        ax=ax,
+        density = False,
+    )
+    #for i in range(0, len(bins2)-1):
+    #    x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i]
+    #    y_pos_sig = counts1[i] + (counts1[i] * 0.01)
+    #    label_p_sig = str(counts11[i])
+    #    x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i]
+    #    y_pos = counts2[i] + (counts2[i] * 0.01)
+    #    label_p = str(counts22[i])
+    #    if i%5 == 0:
+    #        ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green')
+    #    if i%6 == 0:
+    #        ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red')
+    ## plot compare list
+    n_bins = 80
+    xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig]))
+    bins = np.array([minval + i*((maxval-minval)/50) for i in range(0, 51)])
+    print(bins)
+    hep.histplot(
+        [np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)])),
+        np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)])),
+        np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)])), + 
np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]))], + stack = True, + label=['DY bg', 'ZZ bg', 'WZ bg', 'tt bg', 'ZH -> bb bg'], + histtype="fill", + color=['g', 'y', 'b', 'm', 'c'], + #bins = np.arange(80), + yerr = True, + #yerr= [np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]))[0]), + # np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]))[0])], + ax=ax, + density = False, + alpha = [0.3, 0.3, 0.3, 0.3, 0.3], + edgecolor = ["k", "k", "k", "k", "k"], + + ) + + ## plot compare list + ax.errorbar( + (data_bins[:-1] + data_bins[1:])/2, + np.array(data_counts), + label='Data', + marker = 'o', + color='k', + yerr=np.sqrt(np.array(data_counts)), #*(1/np.sum(data_counts)) + linestyle = "None", + ) + # plot ratio of com/Ref + nbinning = 50 + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]), density = True) + counts3, bins3 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), density = True) + counts4, bins4 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), density = True) + counts5, bins5 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), density = True) + counts6, bins6 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), density = True) + + ratio_dy = np.divide(counts1, counts2, where = (counts2 != 0)) + ratio_zz = np.divide(counts1, counts3, where = (counts3 != 0)) + ratio_wz = 
np.divide(counts1, counts4, where = (counts4 != 0)) + ratio_tt = np.divide(counts1, counts5, where = (counts5 != 0)) + ratio_zhtobb = np.divide(counts1, counts6, where = (counts6 != 0)) + rax.plot(bins1[:-1], ratio_dy, 'go') + rax.plot(bins1[:-1], ratio_zz, 'yo') + rax.plot(bins1[:-1], ratio_wz, 'bo') + rax.plot(bins1[:-1], ratio_tt, 'mo') + rax.plot(bins1[:-1], ratio_zhtobb, 'co') + rax.plot(bins1[:-1], [1]*len(ratio_dy), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated_data[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 4.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + #hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### No rescaling #################################################################################### + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13", data=True, loc=0, ax=ax) + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]), density = True) + print(f'{col}_{roi}', len_sig, len_dy) + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins =80) + ## plot reference + n_bins = 50 + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + bins = np.array([minval + i*((maxval-minval)/50) for i in range(0, 51)]) + data_counts, data_bins = np.histogram(np.array(df_data_final[f'{col}_{roi}']),bins =bins, weights = np.array(df_data_final[f'wei_{roi}'])) + counts_sig, bins_sig = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = bins, weights = np.array(df[f'wei_{roi}'][:len_sig])) + hep.histplot( + #np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 25, weights = 100000*xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])), + #label= 'ZH -> cc signal $\cdot 10^5$', + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = bins, weights = 1000*xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])), + #label= 'ZH -> cc signal', + label= 'ZH -> c$\\bar{c}$ \n signal \n ($\cdot 10^3$)', + histtype="step", + color='r', + yerr= 1000*xsec_weights[0]*np.sqrt(counts_sig), + #yerr = True, + ax=ax, + density = False, + 
) + #for i in range(0, len(bins2)-1): + # x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + # y_pos_sig = counts1[i] + (counts1[i] * 0.01) + # label_p_sig = str(counts11[i]) + # x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + # y_pos = counts2[i] + (counts2[i] * 0.01) + # label_p = str(counts22[i]) + # if i%5 == 0: + # ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + # if i%6 == 0: + # ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + n_bins = 50 + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + bins = np.array([minval + i*((maxval-minval)/50) for i in range(0, 51)]) + print(bins) + hep.histplot( + [np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]))], + stack = True, + label=['DY bg', 'ZZ bg', 'WZ bg', 'tt bg', 'ZH -> bb bg'], + histtype="fill", + color=['g', 'y', 'b', 'm', 'c'], + #bins = np.arange(80), + yerr = True, + #yerr= [np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]))[0]), + # np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]))[0])], + ax=ax, + density = False, + alpha = [0.3, 0.3, 0.3, 0.3, 0.3], + edgecolor = ["k", "k", "k", "k", "k"], + + ) + + ## plot compare list + ax.errorbar( + (data_bins[:-1] + data_bins[1:])/2, + np.array(data_counts), + label='Data', + marker = 'o', + color='k', + yerr=np.sqrt(np.array(data_counts)), #*(1/np.sum(data_counts)) + linestyle = "None", + ) + + # plot ratio of com/Ref + nbinning = 50 + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = bins, weights = 
xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)])) + #counts2_dy_pure, bins2_dy_pure = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins = bins) + #print('counts dy') + #print(counts2_dy_pure, np.sqrt(counts2_dy_pure)) + #print(np.sum(counts2_dy_pure)) + #print('unc dy rescaled') + #print(np.sqrt(counts2_dy_pure)*xsec_weights[1]) + counts2_dy_pure, bins2_dy_pure = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins = bins, weights = np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)])) + unc_dy_pure = np.sqrt(counts2_dy_pure)*xsec_weights[1] + #print('counts dy') + #print(counts2_dy_pure, np.sqrt(counts2_dy_pure)) + #print(np.sum(counts2_dy_pure)) + #print('unc dy rescaled') + #print(np.sqrt(counts2_dy_pure)*xsec_weights[1]) + #print(np.sqrt(counts2)) + + + counts3, bins3 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]),bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)])) + + counts3_zz_pure, bins3_zz_pure = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]),bins = bins, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)])) + unc_zz_pure = np.sqrt(counts3_zz_pure)*xsec_weights[2] + + + counts4, bins4 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]),bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)])) + + counts4_wz_pure, bins4_wz_pure = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]),bins = bins, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)])) + unc_wz_pure = np.sqrt(counts4_wz_pure)*xsec_weights[3] + + + counts5, bins5 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]),bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)])) + + counts5_tt_pure, bins5_tt_pure = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]),bins = bins, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)])) + unc_tt_pure = np.sqrt(counts5_tt_pure)*xsec_weights[4] + + + counts6, bins6 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]),bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):])) + + counts6_zhbb_pure, bins6_zhbb_pure = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]),bins = bins, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):])) + unc_zhbb_pure = np.sqrt(counts6_zhbb_pure)*xsec_weights[5] + + + sum_c = counts1 + counts2 + counts3 + counts4 + counts5 + counts6 + mc_arrays = [np.array(var) for var in [unc_dy_pure, unc_zz_pure, unc_wz_pure, unc_tt_pure, unc_zhbb_pure]] + unc_sum = np.sqrt(unc_dy_pure**2 + unc_zz_pure**2 + unc_wz_pure**2 + unc_tt_pure**2 + unc_zhbb_pure**2) + unc_data_minus_mc = np.sqrt(data_counts + unc_sum**2) + + print('rest') + print( data_counts, sum_c, unc_sum) + #print(data_counts - sum_c) + 
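# Uncertainty model for the (Data - MC)/MC ratio computed below: with
+    # D = data_counts and M = sum_c, the code propagates
+    #     sigma_r ~ |r| * sqrt( (sigma_(D-M)/(D-M))**2 + (sigma_M/M)**2 ),
+    # where sigma_(D-M) = sqrt(D + sigma_M**2) combines the Poisson uncertainty
+    # of the data with the MC statistical uncertainty. M enters both terms, so
+    # the correlation between (D - M) and M is neglected here; the exact
+    # Gaussian propagation for r = (D - M)/M would be
+    #     sigma_r**2 = sigma_D**2/M**2 + (D**2 * sigma_M**2)/M**4 . + 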
#print((data_counts - sum_c)/sum_c) + ratio = np.divide((data_counts - sum_c), sum_c, where = (sum_c != 0)) + unc_ratio = ratio*np.sqrt((np.divide(unc_data_minus_mc, (data_counts - sum_c), where = ((data_counts - sum_c) != 0)))**2 + (np.divide(unc_sum, sum_c, where = (sum_c != 0)))**2) + rax.errorbar((data_bins[:-1] + data_bins[1:])/2, ratio, yerr = np.abs(unc_ratio), color = "k", fmt = '.', marker = 'o', markeredgecolor = 'k') + #plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + #rax.plot(bins[:-1], ratio, 'ko') + rax.plot((data_bins[:-1] + data_bins[1:])/2, [0]*len(ratio), '--', color = 'black') + + def line(x, a, b): + return a*x + b + if col == 'Higgs_mass': + popt ,pcov = scipy.optimize.curve_fit(line, bins[:-1], ratio, sigma = np.abs(unc_ratio), absolute_sigma = True, p0= [0, 0.3]) + print(popt, np.sqrt(pcov)) + print(1+popt[1]) + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated_data[c]} {roi_latex}') + ax.set_xlabel(None) + ax.set_ylabel("Events") + rax.set_ylabel('$\\frac{Data - MC}{MC}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(-0.4, 1.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + #hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/No_dense/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/No_dense/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### No rescaling hist density ###################################################################### + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(12, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13", data=True, loc=0, ax=ax) + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]), density = True) + print(f'{col}_{roi}', len_sig, len_dy) + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins =80) + ## plot reference + n_bins = 50 + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + if col == 'Higgs_mass': + bins = np.array([0 + i*((250)/15) for i in range(0, 16)]) + elif col == 'Higgs_pt': + bins = np.array([0 + i*((300)/20) for i in range(0, 21)]) + elif col == 'Z_pt': + bins = np.array([50 + i*((100)/50) for i in range(0, 51)]) + elif col == 'Z_mass': + bins = np.array([75 + i*((30)/50) for i in range(0, 51)]) + elif col == 'jjVptratio': + bins = np.array([0 + i*((2)/15) for i in range(0, 16)]) + elif col == 'CvsL_max': + bins = np.array([0 + 
i*((1)/20) for i in range(0, 21)]) + elif col == 'CvsL_min': + bins = np.array([0 + i*((1)/20) for i in range(0, 21)]) + elif col == 'CvsB_max': + bins = np.array([0 + i*((1)/20) for i in range(0, 21)]) + elif col == 'CvsB_min': + bins = np.array([0 + i*((1)/20) for i in range(0, 21)]) + elif col == 'pt_lead': + bins = np.array([0 + i*((350)/25) for i in range(0, 26)]) + elif col == 'pt_sublead': + bins = np.array([0 + i*((350)/25) for i in range(0, 26)]) + elif col == 'del_phi_jjV': + bins = np.array([0 + i*((np.pi)/15) for i in range(0, 16)]) + elif col == 'del_R_jj': + bins = np.array([0 + i*((5)/15) for i in range(0, 16)]) + elif col == 'del_eta_jj': + bins = np.array([0 + i*((3)/15) for i in range(0, 16)]) + elif col == 'del_phi_ll': + bins = np.array([0 + i*((np.pi)/15) for i in range(0, 16)]) + elif col == 'del_eta_ll': + bins = np.array([0 + i*((2.6)/15) for i in range(0, 16)]) + elif col == 'del_phi_l2_leading': + bins = np.array([0 + i*((np.pi)/15) for i in range(0, 16)]) + elif col == 'del_phi_l2_subleading': + bins = np.array([0 + i*((np.pi)/15) for i in range(0, 16)]) + data_counts, data_bins = np.histogram(np.array(df_data_final[f'{col}_{roi}']),bins =bins, weights = np.array(df_data_final[f'wei_{roi}'])) + counts_sig, bins_sig = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = bins, weights = np.array(df[f'wei_{roi}'][:len_sig])) + hep.histplot( + #np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 25, weights = 100000*xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])), + #label= 'ZH -> cc signal $\cdot 10^5$', + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = bins, weights = 1000*xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])), + #label= 'ZH -> cc signal', + label= 'ZH -> c$\\bar{c}$ \n signal \n ($\cdot 10^3$)', + histtype="step", + color='r', + yerr= 1000*xsec_weights[0]*np.sqrt(counts_sig), + #yerr = True, + ax=ax, + density = False, + ) + #for i in range(0, len(bins2)-1): + # x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + # y_pos_sig = counts1[i] + (counts1[i] * 0.01) + # label_p_sig = str(counts11[i]) + # x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + # y_pos = counts2[i] + (counts2[i] * 0.01) + # label_p = str(counts22[i]) + # if i%5 == 0: + # ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + # if i%6 == 0: + # ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + n_bins = 50 + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + if col == 'Higgs_mass': + bins = np.array([0 + i*((250)/15) for i in range(0, 16)]) + elif col == 'Higgs_pt': + bins = np.array([0 + i*((300)/20) for i in range(0, 21)]) + elif col == 'Z_pt': + bins = np.array([50 + i*((100)/50) for i in range(0, 51)]) + elif col == 'Z_mass': + bins = np.array([75 + i*((30)/50) for i in range(0, 51)]) + elif col == 'jjVptratio': + bins = np.array([0 + i*((2)/15) for i in range(0, 16)]) + elif col == 'CvsL_max': + bins = np.array([0 + i*((1)/20) for i in range(0, 21)]) + elif col == 'CvsL_min': + bins = np.array([0 + i*((1)/20) for i in range(0, 21)]) + elif col == 'CvsB_max': + bins = np.array([0 + i*((1)/20) for i in range(0, 21)]) + elif col == 'CvsB_min': + bins = np.array([0 + i*((1)/20) for i in range(0, 21)]) + elif col == 'pt_lead': + bins = np.array([0 + i*((350)/25) for i in range(0, 26)]) + elif col == 'pt_sublead': + bins = np.array([0 + i*((350)/25) for i in range(0, 26)]) + elif col == 'del_phi_jjV': + bins = np.array([0 + i*((np.pi)/15) for i in 
range(0, 16)]) + elif col == 'del_R_jj': + bins = np.array([0 + i*((5)/15) for i in range(0, 16)]) + elif col == 'del_eta_jj': + bins = np.array([0 + i*((3)/15) for i in range(0, 16)]) + elif col == 'del_phi_ll': + bins = np.array([0 + i*((np.pi)/15) for i in range(0, 16)]) + elif col == 'del_eta_ll': + bins = np.array([0 + i*((2.6)/15) for i in range(0, 16)]) + elif col == 'del_phi_l2_leading': + bins = np.array([0 + i*((np.pi)/15) for i in range(0, 16)]) + elif col == 'del_phi_l2_subleading': + bins = np.array([0 + i*((np.pi)/15) for i in range(0, 16)]) + print(bins) + hep.histplot( + [np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]))], + stack = True, + label=['DY bg', 'ZZ bg', 'WZ bg', 't$\\bar{t}$ bg', 'ZH -> b$\\bar{b}$ \n bg'], + histtype="fill", + color=['g', 'y', 'b', 'm', 'c'], + #bins = np.arange(80), + yerr = True, + #yerr= [np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]))[0]), + # np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]))[0])], + ax=ax, + density = False, + alpha = [0.3, 0.3, 0.3, 0.3, 0.3], + edgecolor = ["k", "k", "k", "k", "k"], + + ) + + ## plot compare list + ax.errorbar( + (data_bins[:-1] + data_bins[1:])/2, + np.array(data_counts), + label='Data', + marker = 'o', + color='k', + yerr=np.sqrt(np.array(data_counts)), #*(1/np.sum(data_counts)) + linestyle = "None", + ) + + # plot ratio of com/Ref + nbinning = 50 + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = bins, weights = xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])) + counts2, bins2 = 
np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)])) + #counts2_dy_pure, bins2_dy_pure = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins = bins) + #print('counts dy') + #print(counts2_dy_pure, np.sqrt(counts2_dy_pure)) + #print(np.sum(counts2_dy_pure)) + #print('unc dy rescaled') + #print(np.sqrt(counts2_dy_pure)*xsec_weights[1]) + counts2_dy_pure, bins2_dy_pure = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins = bins, weights = np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)])) + unc_dy_pure = np.sqrt(counts2_dy_pure)*xsec_weights[1] + #print('counts dy') + #print(counts2_dy_pure, np.sqrt(counts2_dy_pure)) + #print(np.sum(counts2_dy_pure)) + #print('unc dy rescaled') + #print(np.sqrt(counts2_dy_pure)*xsec_weights[1]) + #print(np.sqrt(counts2)) + + + counts3, bins3 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]),bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)])) + + counts3_zz_pure, bins3_zz_pure = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]),bins = bins, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)])) + unc_zz_pure = np.sqrt(counts3_zz_pure)*xsec_weights[2] + + + counts4, bins4 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]),bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)])) + + counts4_wz_pure, bins4_wz_pure = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]),bins = bins, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)])) + unc_wz_pure = np.sqrt(counts4_wz_pure)*xsec_weights[3] + + + counts5, bins5 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]),bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)])) + + counts5_tt_pure, bins5_tt_pure = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]),bins = bins, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)])) + unc_tt_pure = np.sqrt(counts5_tt_pure)*xsec_weights[4] + + + counts6, bins6 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]),bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):])) + + counts6_zhbb_pure, bins6_zhbb_pure = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]),bins = bins, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):])) + unc_zhbb_pure = np.sqrt(counts6_zhbb_pure)*xsec_weights[5] + + + sum_c = counts1 + counts2 + counts3 + counts4 + counts5 + counts6 + mc_arrays = [np.array(var) for var in [unc_dy_pure, unc_zz_pure, unc_wz_pure, unc_tt_pure, unc_zhbb_pure]] + unc_sum = np.sqrt(unc_dy_pure**2 + unc_zz_pure**2 + unc_wz_pure**2 + unc_tt_pure**2 + unc_zhbb_pure**2) + unc_data_minus_mc = np.sqrt(data_counts + unc_sum**2) + + print('rest') + print( data_counts, sum_c, unc_sum) + #print(data_counts - sum_c) + #print((data_counts - sum_c)/sum_c) + ratio = np.divide((data_counts - sum_c), sum_c, where 
= (sum_c != 0)) + unc_ratio = ratio*np.sqrt((np.divide(unc_data_minus_mc, (data_counts - sum_c), where = ((data_counts - sum_c) != 0)))**2 + (np.divide(unc_sum, sum_c, where = (sum_c != 0)))**2) + rax.errorbar((data_bins[:-1] + data_bins[1:])/2, ratio, yerr = np.abs(unc_ratio), color = "k", fmt = '.', marker = 'o', markeredgecolor = 'k') + #plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + #rax.plot(bins[:-1], ratio, 'ko') + rax.plot((data_bins[:-1] + data_bins[1:])/2, [0]*len(ratio), '--', color = 'black') + + def line(x, a, b): + return a*x + b + if col == 'Higgs_mass': + popt ,pcov = scipy.optimize.curve_fit(line, bins[:-1], ratio, sigma = np.abs(unc_ratio), absolute_sigma = True, p0= [0, 0.3]) + print(popt, np.sqrt(pcov)) + print(1+popt[1]) + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated_data[c]} {roi_latex}') + ax.set_xlabel(None) + ax.set_ylabel("Events") + rax.set_ylabel('$\\frac{Data - MC}{MC}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + box = ax.get_position() + ax.set_position([box.x0, box.y0 , box.width*0.8, box.height]) + # Put a legend to the right of the current axis + ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), fancybox = True, shadow = True, ncols = 1, fontsize = 'x-small', labelspacing = 1.6) + + rax.set_ylim(-0.4, 1.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + if col == 'Higgs_mass': + minval, maxval = 0, 250 + elif col == 'Higgs_pt': + minval, maxval = 0, 300 + elif col == 'Z_pt': + minval, maxval = 50, 150 + elif col == 'Z_mass': + minval, maxval = 75, 105 + elif col == 'jjVptratio': + minval, maxval = 0, 2 + elif col == 'CvsL_max': + minval, maxval = 0, 1 + elif col == 'CvsL_min': + minval, maxval = 0, 1 + elif col == 'CvsB_max': + minval, maxval = 0, 1 + elif col == 'CvsB_min': + minval, maxval = 0, 1 + elif col == 'pt_lead': + minval, maxval = 0, 350 + elif col == 'pt_sublead': + minval, maxval = 0, 350 + elif col == 'del_phi_jjV': + minval, maxval = 0, np.pi + elif col == 'del_R_jj': + minval, maxval = 0, 5 + elif col == 'del_eta_jj': + minval, maxval = 0, 3 + elif col == 'del_phi_ll': + minval, maxval = 0, np.pi + elif col == 'del_eta_ll': + minval, maxval = 0, 2.6 + elif col == 'del_phi_l2_leading': + minval, maxval = 0, np.pi + elif col == 'del_phi_l2_subleading': + minval, maxval = 0, np.pi + rax.set_xlim(minval, maxval) + boxr = rax.get_position() + rax.set_position([boxr.x0, boxr.y0, boxr.width*0.8, boxr.height]) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + #hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/Np_dense/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Np_dense/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### No rescaling hist density True ################################################################# + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(12, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, 
bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13", data=True, loc=0, ax=ax) + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]), density = True) + print(f'{col}_{roi}', len_sig, len_dy) + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins =80) + ## plot reference + n_bins = 50 + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + bins = np.array([minval + i*((maxval-minval)/50) for i in range(0, 51)]) + if col == 'jjVptratio': + bins = np.array([0 + i*((2)/50) for i in range(0, 51)]) + data_counts, data_bins = np.histogram(np.array(df_data_final[f'{col}_{roi}']),bins =bins, weights = np.array(df_data_final[f'wei_{roi}'])) + counts_sig, bins_sig = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = bins, weights = np.array(df[f'wei_{roi}'][:len_sig])) + hep.histplot( + #np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 25, weights = 100000*xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])), + #label= 'ZH -> cc signal $\cdot 10^5$', + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = bins, weights = 1000*xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])), + #label= 'ZH -> cc signal', + label= 'ZH -> c$\\bar{c}$ \n signal \n ($\cdot 10^3$)', + histtype="step", + color='r', + yerr= 1000*xsec_weights[0]*np.sqrt(counts_sig), + #yerr = True, + ax=ax, + density = False, + ) + #for i in range(0, len(bins2)-1): + # x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + # y_pos_sig = counts1[i] + (counts1[i] * 0.01) + # label_p_sig = str(counts11[i]) + # x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + # y_pos = counts2[i] + (counts2[i] * 0.01) + # label_p = str(counts22[i]) + # if i%5 == 0: + # ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + # if i%6 == 0: + # ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + n_bins = 50 + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + bins = np.array([minval + i*((maxval-minval)/50) for i in range(0, 51)]) + if col == 'jjVptratio': + bins = np.array([0 + i*((2)/50) for i in range(0, 51)]) + print(bins) + hep.histplot( + [np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), bins = bins, weights = 
xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]))], + stack = True, + label=['DY bg', 'ZZ bg', 'WZ bg', 't$\\bar{t}$ bg', 'ZH -> b$\\bar{b}$ \n bg'], + histtype="fill", + color=['g', 'y', 'b', 'm', 'c'], + #bins = np.arange(80), + yerr = True, + #yerr= [np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]))[0]), + # np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]))[0])], + ax=ax, + density = False, + alpha = [0.3, 0.3, 0.3, 0.3, 0.3], + edgecolor = ["k", "k", "k", "k", "k"], + + ) + + ## plot compare list + ax.errorbar( + (data_bins[:-1] + data_bins[1:])/2, + np.array(data_counts), + label='Data', + marker = 'o', + color='k', + yerr=np.sqrt(np.array(data_counts)), #*(1/np.sum(data_counts)) + linestyle = "None", + ) + + # plot ratio of com/Ref + nbinning = 50 + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = bins, weights = xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)])) + #counts2_dy_pure, bins2_dy_pure = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins = bins) + #print('counts dy') + #print(counts2_dy_pure, np.sqrt(counts2_dy_pure)) + #print(np.sum(counts2_dy_pure)) + #print('unc dy rescaled') + #print(np.sqrt(counts2_dy_pure)*xsec_weights[1]) + counts2_dy_pure, bins2_dy_pure = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins = bins, weights = np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)])) + unc_dy_pure = np.sqrt(counts2_dy_pure)*xsec_weights[1] + #print('counts dy') + #print(counts2_dy_pure, np.sqrt(counts2_dy_pure)) + #print(np.sum(counts2_dy_pure)) + #print('unc dy rescaled') + #print(np.sqrt(counts2_dy_pure)*xsec_weights[1]) + #print(np.sqrt(counts2)) + + + counts3, bins3 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]),bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)])) + + counts3_zz_pure, bins3_zz_pure = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]),bins = bins, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)])) + unc_zz_pure = np.sqrt(counts3_zz_pure)*xsec_weights[2] + + + counts4, bins4 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]),bins = bins, weights = 
xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)])) + + counts4_wz_pure, bins4_wz_pure = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]),bins = bins, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)])) + unc_wz_pure = np.sqrt(counts4_wz_pure)*xsec_weights[3] + + + counts5, bins5 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]),bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)])) + + counts5_tt_pure, bins5_tt_pure = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]),bins = bins, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)])) + unc_tt_pure = np.sqrt(counts5_tt_pure)*xsec_weights[4] + + + counts6, bins6 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]),bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):])) + + counts6_zhbb_pure, bins6_zhbb_pure = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]),bins = bins, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):])) + unc_zhbb_pure = np.sqrt(counts6_zhbb_pure)*xsec_weights[5] + + + sum_c = counts1 + counts2 + counts3 + counts4 + counts5 + counts6 + mc_arrays = [np.array(var) for var in [unc_dy_pure, unc_zz_pure, unc_wz_pure, unc_tt_pure, unc_zhbb_pure]] + unc_sum = np.sqrt(unc_dy_pure**2 + unc_zz_pure**2 + unc_wz_pure**2 + unc_tt_pure**2 + unc_zhbb_pure**2) + unc_data_minus_mc = np.sqrt(data_counts + unc_sum**2) + + print('rest') + print( data_counts, sum_c, unc_sum) + #print(data_counts - sum_c) + #print((data_counts - sum_c)/sum_c) + ratio = np.divide((data_counts - sum_c), sum_c, where = (sum_c != 0)) + unc_ratio = ratio*np.sqrt((np.divide(unc_data_minus_mc, (data_counts - sum_c), where = ((data_counts - sum_c) != 0)))**2 + (np.divide(unc_sum, sum_c, where = (sum_c != 0)))**2) + rax.errorbar((data_bins[:-1] + data_bins[1:])/2, ratio, yerr = np.abs(unc_ratio), color = "k", fmt = '.', marker = 'o', markeredgecolor = 'k') + #plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + #rax.plot(bins[:-1], ratio, 'ko') + rax.plot((data_bins[:-1] + data_bins[1:])/2, [0]*len(ratio), '--', color = 'black') + + def line(x, a, b): + return a*x + b + if col == 'Higgs_mass': + popt ,pcov = scipy.optimize.curve_fit(line, bins[:-1], ratio, sigma = np.abs(unc_ratio), absolute_sigma = True, p0= [0, 0.3]) + print(popt, np.sqrt(pcov)) + print(1+popt[1]) + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated_data[c]} {roi_latex}') + ax.set_xlabel(None) + ax.set_ylabel("Events") + rax.set_ylabel('$\\frac{Data - MC}{MC}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + box = ax.get_position() + ax.set_position([box.x0, box.y0 , box.width*0.8, box.height]) + # Put a legend to the right of the current axis + ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), fancybox = True, shadow = True, ncols = 1, fontsize = 'x-small', labelspacing = 1.6) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + + if col == 'jjVptratio': + minval, maxval = 0, 2 + boxr = rax.get_position() + rax.set_position([boxr.x0, 
boxr.y0, boxr.width*0.8, boxr.height]) + rax.set_xlim(minval, maxval) + rax.set_ylim(-0.4, 1.0) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + #hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/Np_dense_True/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Np_dense_True/compare_{col}_{roi}.jpg") + + c += 1 + +X = df.drop("target", axis = 1) +X = X.drop("target_bg", axis = 1) +print(X) +X = X.drop(f"wei_{roi}", axis = 1) +X = X.drop(f"Z_mass_{roi}", axis = 1) +X = X.drop(f"Z_pt_gen_{roi}", axis = 1) +X = X.drop(f"Z_mass_gen_{roi}", axis = 1) +print(X) +print(X.info()) + +y = df["target"] +print(y) + + + + +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder + +categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + +from sklearn.preprocessing import StandardScaler +numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + +cat_cols = X.select_dtypes(exclude = "number").columns +num_cols = X.select_dtypes(include = "number").columns + +print(cat_cols) +print(num_cols) + +from sklearn.compose import ColumnTransformer + +full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + +import xgboost as xgb + +X_processed = full_processor.fit_transform(X) +y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state = 1121218) + +from sklearn.metrics import accuracy_score + +### Init classifier +xgb_cl = xgb.XGBClassifier(booster = 'gbtree', base_score = 0.5, learning_rate = 0.01, gamma = 1, reg_alpha = 0.2, reg_lambda = 0.2, n_estimators = 1000, max_depth = 3, subsample = 0.8) + +### Fit +xgb_cl.fit(X_train, y_train) + +print(xgb_cl) +### Predict +preds = xgb_cl.predict(X_test) + +print(accuracy_score(y_test, preds)) + +from xgboost import plot_importance +from xgboost import plot_tree + +importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_}) +importances = importances.sort_values(by = "Importance", ascending = False) +importances = importances.set_index('Feature') +print(importances) +importances.plot.bar() + +fig, ax = plt.subplots(figsize=(17,12)) +plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax) +plt.xlabel('Feature scores') +plt.ylabel("Feature names") +plt.title('Importance plot') +plt.legend(['']) +#plt.show() +plt.savefig(f"./plot/{folder_save}/importance.jpg") + +plt.figure(figsize=(17,12)) +plot_tree(xgb_cl, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"./plot/{folder_save}/boost_tree.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show() diff --git a/xgb_test_no_coffea_diff_bgs_DATA_scale_pandas_numpy_test.py 
b/xgb_test_no_coffea_diff_bgs_DATA_scale_pandas_numpy_test.py new file mode 100644 index 0000000..ec6fbfc --- /dev/null +++ b/xgb_test_no_coffea_diff_bgs_DATA_scale_pandas_numpy_test.py @@ -0,0 +1,813 @@ +from coffea.util import load +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt, mplhep as hep +import hist +import argparse, sys, os, arrow, glob, yaml +from matplotlib.offsetbox import AnchoredText +from pathlib import Path +import os +from BTVNanoCommissioning.utils.plot_utils import ( + plotratio, + +) +net_path = "/net/scratch_cms3a/vaulin/" +folder_save = 'eval_23_09_14' +if not os.path.exists(f"./plot/{folder_save}"): + os.mkdir(f"./plot/{folder_save}") +if not os.path.exists(f"./plot/{folder_save}/No_dense"): + os.mkdir(f"./plot/{folder_save}/No_dense") +if not os.path.exists(f"./plot/{folder_save}/Np_dense"): + os.mkdir(f"./plot/{folder_save}/Np_dense") +if not os.path.exists(f"./plot/{folder_save}/Np_dense_True"): + os.mkdir(f"./plot/{folder_save}/Np_dense_True") +if not os.path.exists(net_path + f"plot/{folder_save}"): + os.mkdir(net_path + f"plot/{folder_save}") +def autoranger(array): + val, axis = array, np.arange(0,len(array)+1) + for i in range(len(val)): + if val[i] != 0: + mins = i + break + for i in reversed(range(len(val))): + if val[i] != 0: + maxs = i + 1 + break + print(axis[mins], axis[maxs]) + return axis[mins], axis[maxs], np.max(val), np.min(val) +names_sig = ['wei', 'Higgs_mass', 'Higgs_pt', 'Z_pt', 'Z_mass', 'Z_pt_gen', 'Z_mass_gen', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +names_sig_data = ['wei', 'Higgs_mass', 'Higgs_pt', 'Z_pt', 'Z_mass', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +roiis = ['high_mumu', 'high_ee', 'low_mumu', 'low_ee'] +roi = 'low_mumu' +#roi = 'low_ee' + +###################################################################################### +##### Read np arrays of signal sample ################################################ +###################################################################################### +''' +data_path = 'condor_signal_06_mid/' +paths_np = [str(x) for x in Path(data_path + "ZHToCC_vau_sig").glob("**/*.npy") if ("_full" in str(x))] +#print(paths_np) +print(len(paths_np)) +df_sig_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_sig_full_np) + +key_np = {} +for col in names_sig: + for rois in roiis: + key_np[f'{col}_{rois}'] = [] +for col in names_sig: + for rois in roiis: + for path in paths_np: + if f'{col}_{rois}' in path: + key_np[f'{col}_{rois}'].append(path) + +for key in key_np.keys(): + #print(len(key_np[key]) == len(set(key_np[key]))) + key_np[key] = [np.load(element) for element in key_np[key]] + #print(key) + +print(key_np) + +key_np_full = {} +max_length = 0 +for col in names_sig: + for rois in roiis: + key_np_full[f'{col}_{rois}'] = np.array([]) +print(key_np_full) +for key in key_np_full.keys(): + key_np_full[key] = np.concatenate(tuple(key_np[key]), axis = None) + print(len(key_np_full[key])) + if max_length < len(key_np_full[key]): + max_length = len(key_np_full[key]) + +for key in key_np_full.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + 
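# Each ROI keeps a different number of selected events, so the per-key
+    # arrays are ragged; every column is padded with NaN up to max_length to
+    # fit a single rectangular DataFrame, and the valid entries are recovered
+    # later with dropna(). Minimal sketch of the padding idea used in the next
+    # line (illustrative only):
+    #     padded = np.append(arr, np.repeat(np.nan, max_length - len(arr))) + 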
df_sig_full_np[key] = list(np.append(key_np_full[key], np.repeat(np.nan, max_length- (len(key_np_full[key]))))) +#df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) +print(df_sig_full_np) +df_s_new_np = df_sig_full_np[[f'{col}_{roi}' for col in names_sig]] + +print(len(df_s_new_np[f"wei_{roi}"])) +our_aray_results = len(df_s_new_np[f"wei_{roi}"]) + + + +df_s_new_np = df_s_new_np.dropna() +print(df_s_new_np) +len_var = [] +for col in names_sig: + len_var.append(len(df_s_new_np[f'{col}_{roi}'])) + df_s_new_np['target'] = np.ones(np.max(len_var)) + df_s_new_np['target_bg'] = np.zeros(np.max(len_var)) +print(df_s_new_np) + + +df_s_new_np.to_csv(f'./plot/{folder_save}/numpy_data_signal_{roi}.csv', sep=',', encoding='utf-8', index=False) +''' +df_s_new_np = pd.read_csv(f'./plot/{folder_save}/numpy_data_signal_{roi}.csv', sep=',', encoding='utf-8') +###################################################################################### + + +###################################################################################### +##### Read np arrays of background sample ############################################ +###################################################################################### +''' +data_path = 'condor_back_09_late/' +def bg_processor(bg, nr): + paths_np_back = [str(x) for x in Path(data_path + f"{bg}").glob("**/*.npy") if ("_full" in str(x))] + #paths_np_back = [str(x) for x in Path("./condor_back_04_mid/DYJetsToLL_nlo_vau_bg").glob("**/*.npy") if ("_full" in str(x))] + #print(paths_np_back) + print(len(paths_np_back)) + df_back_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) + print(df_back_full_np) + + key_np_back = {} + for col in names_sig: + for rois in roiis: + key_np_back[f'{col}_{rois}'] = [] + for col in names_sig: + for rois in roiis: + for path in paths_np_back: + if f'{col}_{rois}' in path: + key_np_back[f'{col}_{rois}'].append(path) + #print(key_np_back) + for key in key_np_back.keys(): + print(len(key_np_back[key]) == len(set(key_np_back[key]))) + for element in key_np_back[key]: + print(element) + np.load(element, allow_pickle = True) + key_np_back[key] = [np.load(element) for element in key_np_back[key]] + print(key) + + #print(key_np_back) + + max_length_back = 0 + key_np_full_back = {} + for col in names_sig: + for rois in roiis: + key_np_full_back[f'{col}_{rois}'] = np.array([]) + for key in key_np_full_back.keys(): + key_np_full_back[key] = np.concatenate(tuple(key_np_back[key]), axis = None) + print(len(key_np_full_back[key])) + if max_length_back < len(key_np_full_back[key]): + max_length_back = len(key_np_full_back[key]) + #print(key_np_full_back) + + for key in key_np_full_back.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_back_full_np[key] = list(np.append(key_np_full_back[key], np.repeat(np.nan, max_length_back- (len(key_np_full_back[key]))))) + #df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) + print(df_back_full_np) + df_b_full_np = df_back_full_np[[f'{col}_{roi}' for col in names_sig]] + df_b_new_np = df_b_full_np.dropna() + print(df_b_new_np) + + len_var = [] + for col in names_sig: + len_var.append(len(df_b_new_np[f'{col}_{roi}'])) + df_b_new_np['target'] = np.zeros(np.max(len_var)) + df_b_new_np['target_bg'] = np.array([nr]*np.max(len_var)) + print(df_b_new_np) + 
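# Labeling convention: 'target' is the binary label the XGBoost classifier
+    # trains on (1 = signal, 0 = background), while 'target_bg' stores the
+    # integer id passed in as nr (1 = DY, 2 = ZZ, 3 = WZ, 4 = ttbar,
+    # 5 = ZH -> bb), so individual backgrounds can still be sliced out of the
+    # merged training DataFrame. The CSV written below caches the result, so
+    # bg_processor only has to run once per background sample. + 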
df_b_new_np.to_csv(f'./plot/{folder_save}/numpy_data_bg_{bg}_{roi}.csv', sep=',', encoding='utf-8', index=False) + return df_b_new_np, len(df_b_new_np['target']) +''' + +#df_b_new_np_dy, len_dy = bg_processor("DYJetsToLL_nlo_vau_bg", 1) +#df_b_new_np_zz, len_zz = bg_processor("ZZTo2L2Q_vau_bg", 2) +#df_b_new_np_wz, len_wz = bg_processor("WZTo2Q2L_vau_bg", 3) +#df_b_new_np_tt, len_tt = bg_processor("TTTo2L2Nu_vau_bg", 4) +#df_b_new_np_zhtobb, len_zhtobb = bg_processor("ZH_HToBB_ZLL_vau_bg_old", 5) + +df_b_new_np_dy, len_dy = pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_DYJetsToLL_nlo_vau_bg_{roi}.csv', sep=',', encoding='utf-8'), len(pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_DYJetsToLL_nlo_vau_bg_{roi}.csv', sep=',', encoding='utf-8')['target']) +df_b_new_np_zz, len_zz = pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_ZZTo2L2Q_vau_bg_{roi}.csv', sep=',', encoding='utf-8'), len(pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_ZZTo2L2Q_vau_bg_{roi}.csv', sep=',', encoding='utf-8')['target']) +df_b_new_np_wz, len_wz = pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_WZTo2Q2L_vau_bg_{roi}.csv', sep=',', encoding='utf-8'), len(pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_WZTo2Q2L_vau_bg_{roi}.csv', sep=',', encoding='utf-8')['target']) +df_b_new_np_tt, len_tt = pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_TTTo2L2Nu_vau_bg_{roi}.csv', sep=',', encoding='utf-8'), len(pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_TTTo2L2Nu_vau_bg_{roi}.csv', sep=',', encoding='utf-8')['target']) +df_b_new_np_zhtobb, len_zhtobb = pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_ZH_HToBB_ZLL_vau_bg_{roi}.csv', sep=',', encoding='utf-8'), len(pd.read_csv(f'./plot/{folder_save}/numpy_data_bg_ZH_HToBB_ZLL_vau_bg_{roi}.csv', sep=',', encoding='utf-8')['target']) + +max_len_bg = 0 +for l in [len_dy, len_zz, len_wz, len_tt, len_zhtobb]: + if max_len_bg < l: + max_len_bg = l + +###################################################################################### +##### Read np arrays of data sample ################################################## +###################################################################################### +data_path = 'condor_back_09_late/' +datas = ["Run2017B_DoubleEG_vau", "Run2017C_DoubleEG_vau", "Run2017E_DoubleEG_vau"] #, "Run2017D_DoubleEG_vau", "Run2017F_DoubleEG_vau", "Run2017F_DoubleMu_vau", Run2017D_DoubleEG_vau", "Run2017B_DoubleMu_vau", "Run2017C_DoubleMu_vau", "Run2017D_DoubleMu_vau", "Run2017E_DoubleMu_vau", + +for data in datas: + + paths_np_data = [str(x) for x in Path(data_path + data).glob("**/*.npy") if ("_full" in str(x))] + + print(len(paths_np_data), data) + df_data_full_np = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig_data for rois in roiis]) + print(df_data_full_np) + + key_np_data = {} + for col in names_sig_data: + for rois in roiis: + key_np_data[f'{col}_{rois}'] = [] + for col in names_sig_data: + for rois in roiis: + for path in paths_np_data: + if f'{col}_{rois}' in path: + key_np_data[f'{col}_{rois}'].append(path) + #print(key_np_back) + for key in key_np_data.keys(): + print(len(key_np_data[key]) == len(set(key_np_data[key]))) + for element in key_np_data[key]: + print(element) + np.load(element, allow_pickle = True) + key_np_data[key] = [np.load(element, allow_pickle = True) for element in key_np_data[key]] + print(key) + + #print(key_np_back) + + max_length_data = 0 + key_np_full_data = {} + for col in names_sig_data: + for rois in roiis: + key_np_full_data[f'{col}_{rois}'] = np.array([]) + for key in 
key_np_full_data.keys(): + key_np_full_data[key] = np.concatenate(tuple(key_np_data[key]), axis = None) + print(len(key_np_full_data[key])) + if max_length_data < len(key_np_full_data[key]): + max_length_data = len(key_np_full_data[key]) + #print(key_np_full_back) + + for key in key_np_full_data.keys(): + #df_sig_full_np[key] = pd.Series(key_np_full[key]) + df_data_full_np[key] = list(np.append(key_np_full_data[key], np.repeat(np.nan, max_length_data- (len(key_np_full_data[key]))))) + #df_sig_full_np = pd.DataFrame([pd.Series(key_np_full[key]) for key in key_np_full.keys()], columns = [f'{col}_{rois}' for col in names_sig for rois in roiis]) + print(df_data_full_np) + for r in roiis: + df_data = pd.DataFrame([], columns = [f'{col}_{rois}' for col in names_sig_data for rois in roiis]) + df_dat_full_np = df_data_full_np[[f'{col}_{r}' for col in names_sig_data]] + df_dat_new_np = df_dat_full_np.dropna() + print(df_dat_new_np) + len_var = [] + for col in names_sig_data: + len_var.append(len(df_dat_new_np[f'{col}_{r}'])) + df_dat_new_np['target'] = np.full(np.max(len_var), 2, dtype = int) + print(df_dat_new_np) + df_data = pd.concat([df_data, df_dat_new_np], ignore_index = True) + df_data.to_csv(f'./plot/{folder_save}/numpy_data_DATA_{r}.csv', sep=',', encoding='utf-8', index=False) +df_data = pd.read_csv(f'./plot/{folder_save}/numpy_data_DATA_{roi}.csv', sep=',', encoding='utf-8') +###################################################################################### +###################################################################################### +#folder_save = 'eval_23_07_25_2' +df = pd.concat([df_s_new_np, df_b_new_np_dy], ignore_index = True) +df = pd.concat([df, df_b_new_np_zz], ignore_index = True) +df = pd.concat([df, df_b_new_np_wz], ignore_index = True) +df = pd.concat([df, df_b_new_np_tt], ignore_index = True) +df = pd.concat([df, df_b_new_np_zhtobb], ignore_index = True) +print(df) +print(df.info()) +df.to_csv(net_path + f'/xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8', index=False) +df.to_csv(f'./plot/{folder_save}/xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8', index=False) + +xsec_weights = [0.002342*(41480/3323082), 6077.*(41480/102863931), 3.74*(41480/19134840), + 6.419*(41480/18136498), 88.51*(41480/105859990), 0.00720*(41480/4337504)] + +#xsec_weights = [1 , 1, 1, 1, 1, 1] + +#df = pd.read_csv(f'xgb_training_dataset_{roi}.csv', sep=',', encoding='utf-8') + +print("% of negative weights: " + str(len(df[f"wei_{roi}"][df[f"wei_{roi}"]<0])/len(df[f"wei_{roi}"]))) + +time = arrow.now().format("YY_MM_DD") +plt.style.use(hep.style.ROOT) +names_sig_updated = ['m(H)', '$p_t$(H)', '$p_t$(Z)', 'm(Z)', '$p_t$($Z_{gen}$)', 'm($Z_{gen}$)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$', + '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet', + '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$', + '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$'] + +names_sig_updated_data = ['m(H)', '$p_t$(H)', '$p_t$(Z)', 'm(Z)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$', + '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet', + '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$', + '$\Delta\Phi (l_{subleading}, 
jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$'] + + +c = 0 +for col in names_sig_data[1:]: + + plt.figure(figsize=(10,10)) + len_sig = 0 + for i in range(0,len(df['target'])): + if df['target'][i] == 1: + len_sig += 1 + print(len_sig) + names_big_ax = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'pt_lead', 'pt_sublead'] + if col in names_big_ax: + hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_{roi}'][:len_sig])).plot() + hist.Hist.new.Regular(150, 0, 180).Double().fill(np.array(df[f'{col}_{roi}'][len_sig:])).plot() + else: + hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_{roi}'][:len_sig])).plot() + hist.Hist.new.Regular(150, 0, 5).Double().fill(np.array(df[f'{col}_{roi}'][len_sig:])).plot() + if 'pt' in col: + if 'ratio' not in col: + plt.xlabel('$p_t$ in Gev') + else: + plt.xlabel('') + elif 'mass' in col: + plt.xlabel('Mass in Gev') + else: + plt.xlabel('') + plt.ylabel("Counts") + plt.title(f'{names_sig_updated[c]}_low_ee') + plt.legend(['Signal', 'Background']) + #plt.show() + plt.savefig(f"./plot/{folder_save}/{col}_{roi}.jpg") + + + + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + data_counts, data_bins = np.histogram(np.array(df_data[f'{col}_{roi}']),bins =50, weights = np.array(df_data[f'wei_{roi}'])) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins =80) + ## plot reference + n_bins = 80 + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + bins = np.array([minval + i*((maxval-minval)/50) for i in range(0, 51)]) + + hep.histplot( + #np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 25, weights = 100000*xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])), + #label= 'ZH -> cc signal $\cdot 10^5$', + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 50, weights = xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig])), + label= 'ZH -> cc signal', + histtype="step", + color='r', + #yerr= np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 25, weights = 100000*xsec_weights[0]*np.array(df[f'wei_{roi}'][:len_sig]))[0]), + yerr = True, + ax=ax, + density = False, + ) + #for i in range(0, len(bins2)-1): + # x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + # y_pos_sig = counts1[i] + (counts1[i] * 0.01) + # label_p_sig = str(counts11[i]) + # x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + # y_pos = counts2[i] + (counts2[i] * 0.01) + # label_p = str(counts22[i]) + # if i%5 == 0: + # ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + # if i%6 == 0: + # ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + n_bins = 80 + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + bins = np.array([minval + i*((maxval-minval)/50) for i in range(0, 51)]) + print(bins) + hep.histplot( + 
[np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)])), + np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]))], + stack = True, + label=['DY bg', 'ZZ bg', 'WZ bg', 'tt bg', 'ZH -> bb bg'], + histtype="fill", + color=['g', 'y', 'b', 'm', 'c'], + #bins = np.arange(80), + yerr = True, + #yerr= [np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]), bins = bins, weights = xsec_weights[1]*np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]))[0]), + # np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), bins = bins, weights = xsec_weights[2]*np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), bins = bins, weights = xsec_weights[3]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), bins = bins, weights = xsec_weights[4]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]))[0]), + #np.sqrt(np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), bins = bins, weights = xsec_weights[5]*np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]))[0])], + ax=ax, + density = False, + alpha = [0.3, 0.3, 0.3, 0.3, 0.3], + edgecolor = ["k", "k", "k", "k", "k"], + + ) + + ## plot compare list + ax.errorbar( + (data_bins[:-1] + data_bins[1:])/2, + np.array(data_counts), + label='Data', + marker = 'o', + color='k', + yerr=np.sqrt(np.array(data_counts)), #*(1/np.sum(data_counts)) + linestyle = "None", + ) + + # plot ratio of com/Ref + nbinning = 50 + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:(len_sig+len_dy)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][len_sig:(len_sig+len_dy)]), density = True) + counts3, bins3 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy):(len_sig+len_dy+len_zz)]), density = True) + counts4, bins4 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz):(len_sig+len_dy+len_zz+len_wz)]), density = True) + counts5, bins5 = 
np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz):(len_sig+len_dy+len_zz+len_wz+len_tt)]), density = True) + counts6, bins6 = np.histogram(np.array(df[f'{col}_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]),bins = nbinning, weights = np.array(df[f'wei_{roi}'][(len_sig+len_dy+len_zz+len_wz+len_tt):]), density = True) + + ratio_dy = np.divide(counts1, counts2, where = (counts2 != 0)) + ratio_zz = np.divide(counts1, counts3, where = (counts3 != 0)) + ratio_wz = np.divide(counts1, counts4, where = (counts4 != 0)) + ratio_tt = np.divide(counts1, counts5, where = (counts5 != 0)) + ratio_zhtobb = np.divide(counts1, counts6, where = (counts6 != 0)) + rax.plot(bins1[:-1], ratio_dy, 'go') + rax.plot(bins1[:-1], ratio_zz, 'yo') + rax.plot(bins1[:-1], ratio_wz, 'bo') + rax.plot(bins1[:-1], ratio_tt, 'mo') + rax.plot(bins1[:-1], ratio_zhtobb, 'co') + rax.plot(bins1[:-1], [1]*len(ratio_dy), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated_data[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 4.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + #hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### No rescaling #################################################################################### + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig])) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:])) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig])), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if 
i%5 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%6 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:])), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/No_dense/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/No_dense/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### No rescaling hist density ###################################################################### + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, density = True) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, density = True) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%5 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', 
color = 'green') + if i%6 == 0: + ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/Np_dense/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Np_dense/compare_{col}_{roi}.jpg") + + ###################################################################################################### + #### No rescaling hist density True ################################################################# + ###################################################################################################### + fig, ((ax), (rax)) = plt.subplots( + 2, 1, figsize=(10, 10), gridspec_kw={"height_ratios": (3, 1)}, sharex=True + ) + fig.subplots_adjust(hspace=0.06, top=0.92, bottom=0.1, right=0.97) + hep.cms.label("Private work", com="13.6", data=True, loc=0, ax=ax) + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + + counts11, bins11 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, density = True) + counts22, bins22 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, density = True) + ## plot reference + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True), + label= 'Higgs -> cc', + histtype="step", + color='r', + yerr=True, + ax=ax, + density = True, + ) + for i in range(0, len(bins2)-1): + x_pos_sig = (bins1[i +1] - bins1[i])/4 + bins1[i] + y_pos_sig = counts1[i] + (counts1[i] * 0.01) + label_p_sig = str(counts11[i]) + x_pos = (bins2[i +1] - bins2[i])/4 + bins2[i] + y_pos = counts2[i] + (counts2[i] * 0.01) + label_p = str(counts22[i]) + if i%5 == 0: + ax.text(x_pos, y_pos, label_p, rotation = 'vertical', color = 'green') + if i%6 == 0: + 
ax.text(x_pos_sig, y_pos_sig, label_p_sig, rotation = 'vertical', color = 'red') + ## plot compare list + hep.histplot( + np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True), + label='DY bg', + histtype="step", + color='g', + yerr=True, + ax=ax, + density = True, + ) + # plot ratio of com/Ref + + counts1, bins1 = np.histogram(np.array(df[f'{col}_{roi}'][:len_sig]),bins = 80, weights = np.array(df[f'wei_{roi}'][:len_sig]), density = True) + counts2, bins2 = np.histogram(np.array(df[f'{col}_{roi}'][len_sig:]),bins =80, weights = np.array(df[f'wei_{roi}'][len_sig:]), density = True) + ratio = np.divide(counts1, counts2, where = (counts2 != 0)) + plt.plot(bins1[:-1], ratio, 'ko') + plt.plot(bins1[:-1], [1]*len(ratio), '--', color = 'black') + + + ## plot settings, adjust range + rax.set_xlabel(f'{names_sig_updated[c]} {roi}') + ax.set_xlabel(None) + ax.set_ylabel("Events (normalised)") + rax.set_ylabel('$\\frac{Signal}{Background}$') + ax.ticklabel_format(style="sci", scilimits=(-3, 3)) + ax.get_yaxis().get_offset_text().set_position((-0.065, 1.05)) + ax.legend() + rax.set_ylim(0.0, 2.0) + xmin, xmax, maxval, minval = autoranger(np.array(df[f'{col}_{roi}'][:len_sig])) + rax.set_xlim(minval, maxval) + at = AnchoredText( + "", + loc=2, + frameon=False, + ) + ax.add_artist(at) + hep.mpl_magic(ax=ax) + ax.set_ylim(bottom=0) + + logext = "" + ''' + # log y axis + if "log" in config.keys() and config["log"]: + ax.set_yscale("log") + logext = "_log" + ax.set_ylim(bottom=0.1) + hep.mpl_magic(ax=ax) + if "norm" in config.keys() and config["norm"]: + logext = "_norm" + logext + ''' + fig.savefig(f"./plot/{folder_save}/Np_dense_True/compare_{col}_{roi}.pdf") + fig.savefig(f"./plot/{folder_save}/Np_dense_True/compare_{col}_{roi}.jpg") + + c += 1 + +X = df.drop("target", axis = 1) +X = X.drop("target_bg", axis = 1) +print(X) +X = X.drop(f"wei_{roi}", axis = 1) +X = X.drop(f"Z_mass_{roi}", axis = 1) +X = X.drop(f"Z_pt_gen_{roi}", axis = 1) +X = X.drop(f"Z_mass_gen_{roi}", axis = 1) +print(X) +print(X.info()) + +y = df["target"] +print(y) + + + + +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder + +categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + +from sklearn.preprocessing import StandardScaler +numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + +cat_cols = X.select_dtypes(exclude = "number").columns +num_cols = X.select_dtypes(include = "number").columns + +print(cat_cols) +print(num_cols) + +from sklearn.compose import ColumnTransformer + +full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + +import xgboost as xgb + +X_processed = full_processor.fit_transform(X) +y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state = 1121218) + +from sklearn.metrics import accuracy_score + +### Init classifier +xgb_cl = xgb.XGBClassifier(booster = 'gbtree', base_score = 0.5, learning_rate = 0.01, gamma = 1, reg_alpha = 0.2, reg_lambda = 0.2, n_estimators = 1000, max_depth = 3, 
subsample = 0.8) + +### Fit +xgb_cl.fit(X_train, y_train) + +print(xgb_cl) +### Predict +preds = xgb_cl.predict(X_test) + +print(accuracy_score(y_test, preds)) + +from xgboost import plot_importance +from xgboost import plot_tree + +importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_}) +importances = importances.sort_values(by = "Importance", ascending = False) +importances = importances.set_index('Feature') +print(importances) +importances.plot.bar() + +fig, ax = plt.subplots(figsize=(17,12)) +plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax) +plt.xlabel('Feature scores') +plt.ylabel("Feature names") +plt.title('Importance plot') +plt.legend(['']) +#plt.show() +plt.savefig(f"./plot/{folder_save}/importance.jpg") + +plt.figure(figsize=(17,12)) +plot_tree(xgb_cl, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"./plot/{folder_save}/boost_tree.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show() diff --git a/xgb_test_only_xgb_no_coffea.py b/xgb_test_only_xgb_no_coffea.py index dd3b1d1..e8bc064 100644 --- a/xgb_test_only_xgb_no_coffea.py +++ b/xgb_test_only_xgb_no_coffea.py @@ -17,13 +17,16 @@ ## Create the folder to save the data if it doesn't exist and read in the dataframe ### ####################################################################################### net_path = "/net/scratch_cms3a/vaulin/" -folder_save = 'eval_23_04_11' +folder_save = 'eval_23_07_17_2' roi = 'low_mumu' if not os.path.exists(f"./plot/{folder_save}"): os.mkdir(f"./plot/{folder_save}") +if not os.path.exists(f"./plot/{folder_save}/ROI_simple"): + os.mkdir(f"./plot/{folder_save}/ROI_simple") + if not os.path.exists(net_path + f"plot/{folder_save}"): os.mkdir(net_path + f"plot/{folder_save}") -df = pd.read_csv(net_path + f'xgb_training_dataset_{roi}.csv') +df = pd.read_csv(f'./plot/{folder_save}/xgb_training_dataset_{roi}.csv') time = arrow.now().format("YY_MM_DD") plt.style.use(hep.style.ROOT) @@ -33,6 +36,7 @@ ########## drop target from df and bring it to a separate column, drop weights ######### ######################################################################################## X = df.drop("target", axis = 1) +X = X.drop("target_bg", axis = 1) print(X) X = X.drop(f"wei_{roi}", axis = 1) X = X.drop(f"Z_mass_{roi}", axis = 1) @@ -91,6 +95,7 @@ def objective(space): print("SCORE: ", accuracy) return {'loss': -accuracy, 'status': STATUS_OK} + ######################################################################################### ############# Create pipelines for xgb training ######################################### ######################################################################################### @@ -132,7 +137,7 @@ def objective(space): ############################################################################################################ ######### preparing the XGB classifiers in 20 x 5-folds cross validation using repeated k-fold ############# ############################################################################################################ -cv = RepeatedKFold(n_splits = 5, n_repeats = 20, random_state = 101) +cv = RepeatedKFold(n_splits = 8, n_repeats = 20, random_state = 101) folds = [(train, test) for train, test in cv.split(X_train, y_train)] #print(folds) metrics = ['auc', 'fpr', 'tpr', 'thresholds'] @@ -193,17 +198,17 @@ def convert(x): return x.tolist() raise TypeError(x) -with open(net_path + 
f"plot/{folder_save}/results_lr_{eta}.json", 'w') as outfile: +with open(f"./plot/{folder_save}/results_lr_{eta}.json", 'w') as outfile: #json.dump(results, outfile, indent = 4) str_j = json.dumps(results, indent = 4, sort_keys = True, default=convert) outfile.write(str_j) -with open(net_path + f"plot/{folder_save}/results_zero_train_lr_{eta}.json", 'w') as outfile: +with open(f"./plot/{folder_save}/results_zero_train_lr_{eta}.json", 'w') as outfile: #json.dump(results, outfile, indent = 4) str_j = json.dumps(results_zero_train, indent = 4, sort_keys = True, default=convert) outfile.write(str_j) -with open(net_path + f"plot/{folder_save}/results_weak_train_lr_{eta}.json", 'w') as outfile: +with open(f"./plot/{folder_save}/results_weak_train_lr_{eta}.json", 'w') as outfile: #json.dump(results, outfile, indent = 4) str_j = json.dumps(results_weak_train, indent = 4, sort_keys = True, default=convert) outfile.write(str_j) @@ -247,10 +252,10 @@ def convert(x): fig.update_yaxes(range = [0,1], gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') fig.update_xaxes(range = [0,1], gridcolor = c_grid, constrain = 'domain', linecolor = 'black') -fig.write_image(net_path + f"plot/{folder_save}/plotly_ROC_bg_eff.jpg") -fig.write_image(net_path + f"plot/{folder_save}/plotly_ROC_bg_eff.pdf") +fig.write_image(f"./plot/{folder_save}/ROI_simple/plotly_ROC_bg_eff.jpg") +fig.write_image(f"./plot/{folder_save}/ROI_simple/plotly_ROC_bg_eff.pdf") + -''' fig = go.Figure([go.Scatter(x = 1 - fpr_mean, y = tpr_upper, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), go.Scatter(x = 1 - fpr_mean, y = tpr_lower, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), go.Scatter(x = 1 - fpr_mean, y = tpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.3f}')]) @@ -260,20 +265,22 @@ def convert(x): fig.update_yaxes(range = [0,1], gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') fig.update_xaxes(range = [0,1], gridcolor = c_grid, constrain = 'domain', linecolor = 'black') -fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_rej.jpg") -fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_rej.pdf") -''' +fig.write_image(f"./plot/{folder_save}/ROI_simple/plotly_ROC_bg_rej.jpg") +fig.write_image(f"./plot/{folder_save}/ROI_simple/plotly_ROC_bg_rej.pdf") + ################################################################################################## ########## Actual hyperparameter tuning ########################################################## ################################################################################################## trials = Trials() -#best_hyperparams = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 100, trials = trials) -#print("The best hyperparameters are: ", "\n") -#print(best_hyperparams) - +best_hyperparams = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 100, trials = trials) +print("The best hyperparameters are: ", "\n") +print(best_hyperparams) +################################################################################################## +################################################################################################## +################################################################################################## @@ -291,8 +298,8 @@ def convert(x): from sklearn.metrics import accuracy_score 
### Init classifier -#xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = best_hyperparams['learning_rate'], gamma = best_hyperparams['gamma'], reg_alpha = best_hyperparams['reg_alpha'], reg_lambda = best_hyperparams['reg_lambda'], n_estimators = 200, max_depth = int(best_hyperparams['max_depth']), subsample = best_hyperparams['subsample'], min_child_weight = best_hyperparams['min_child_weight'], colsample_bytree = best_hyperparams['colsample_bytree']) -xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = 0.0292, gamma = 1.087, reg_alpha = 42.0, reg_lambda = 0.381, n_estimators = 200, max_depth = 8, subsample = 0.841, min_child_weight = 2.0, colsample_bytree = 0.994) +xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = best_hyperparams['learning_rate'], gamma = best_hyperparams['gamma'], reg_alpha = best_hyperparams['reg_alpha'], reg_lambda = best_hyperparams['reg_lambda'], n_estimators = 200, max_depth = int(best_hyperparams['max_depth']), subsample = best_hyperparams['subsample'], min_child_weight = best_hyperparams['min_child_weight'], colsample_bytree = best_hyperparams['colsample_bytree']) +#xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = 0.0292, gamma = 1.087, reg_alpha = 42.0, reg_lambda = 0.381, n_estimators = 200, max_depth = 8, subsample = 0.841, min_child_weight = 2.0, colsample_bytree = 0.994) ### Fit @@ -349,7 +356,7 @@ def convert(x): plt.title('Importance plot') plt.legend(['']) #plt.show() -plt.savefig(net_path + f"plot/{folder_save}/importance.jpg") +plt.savefig(f"./plot/{folder_save}/importance.jpg") feature_importance = model.get_score(importance_type = 'weight') keys = list(feature_importance.keys()) @@ -372,13 +379,13 @@ def convert(x): ax2.set_ylabel("Feature names") ax2.set_title('Importance plot') #plt.show() -plt.savefig(f"plot/{folder_save}/importance_train.jpg") +plt.savefig(f"./plot/{folder_save}/ROI_simple/importance_train.jpg") plt.figure(figsize=(17,12)) plot_tree(xgb_cl, fmap = 'feature_map.txt') plt.title('Decision tree graph') #plt.show() -plt.savefig(net_path + f"plot/{folder_save}/boost_tree.jpg", dpi = 1800) +plt.savefig(f"./plot/{folder_save}/ROI_simple/boost_tree.jpg", dpi = 1800) ###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 #plt.show()''' @@ -386,7 +393,7 @@ def convert(x): plot_tree(model_xgb, fmap = 'feature_map.txt') plt.title('Decision tree graph') #plt.show() -plt.savefig(net_path + f"plot/{folder_save}/boost_tree_train.jpg", dpi = 1800) +plt.savefig(f"./plot/{folder_save}/ROI_simple/boost_tree_train.jpg", dpi = 1800) ###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 #plt.show()''' ''' diff --git a/xgb_test_only_xgb_no_coffea_diff_bgs.py b/xgb_test_only_xgb_no_coffea_diff_bgs.py new file mode 100644 index 0000000..db6adf0 --- /dev/null +++ b/xgb_test_only_xgb_no_coffea_diff_bgs.py @@ -0,0 +1,416 @@ +from coffea.util import load +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt, mplhep as hep +import hist +import argparse, sys, os, arrow, glob, yaml +from matplotlib.offsetbox import AnchoredText +import xgboost as xgb +from hyperopt import STATUS_OK, Trials, fmin, hp, tpe +from sklearn.metrics import accuracy_score +from tqdm.notebook import tqdm +from sklearn.metrics import roc_auc_score, roc_curve +from sklearn.model_selection import RepeatedKFold +import json + +####################################################################################### +## Create the folder to save the data if it doesn't exist and read in the dataframe ### 
+####################################################################################### +net_path = "/net/scratch_cms3a/vaulin/" +folder_save = 'eval_23_08_08' +roi = 'low_mumu' +if not os.path.exists(f"./plot/{folder_save}"): + os.mkdir(f"./plot/{folder_save}") +if not os.path.exists(f"./plot/{folder_save}/ROI_simple"): + os.mkdir(f"./plot/{folder_save}/ROI_simple") + +if not os.path.exists(net_path + f"plot/{folder_save}"): + os.mkdir(net_path + f"plot/{folder_save}") +df = pd.read_csv(f'./plot/{folder_save}/xgb_training_dataset_{roi}.csv') + + +bgs = ['DY', "ZZ", "WZ", "tt", "ZHtobb"] +bg_choice = 2 +bg_choice_2 = 0 + +eta = 0.03 +#eta = 0.03, 0.12, 0.3, 0.45, 0.8 + +df = df[(df.target_bg == 0)|(df.target_bg == bg_choice+1)|(df.target_bg == bg_choice_2+1)] + +time = arrow.now().format("YY_MM_DD") +plt.style.use(hep.style.ROOT) + + +######################################################################################## +########## drop target from df and bring it to a separate column, drop weights ######### +######################################################################################## +X = df.drop("target", axis = 1) +X = X.drop("target_bg", axis = 1) +print(X) +X = X.drop(f"wei_{roi}", axis = 1) +X = X.drop(f"Z_mass_{roi}", axis = 1) +X = X.drop(f"Z_pt_gen_{roi}", axis = 1) +X = X.drop(f"Z_mass_gen_{roi}", axis = 1) +print(X) +print(X.info()) + +y = df["target"] +print(y) + + +######################################################################################## +################# GRID search attempt ################################################## +######################################################################################## +''' +from sklearn.model_selection import GridSearchCV + +### Creat the parameter grid +gbm_param_grid = {'max_depth' : [3, 4, 5, 6, 7, 8, 9], 'min_child_weight' : [1], 'gamma' : [0], 'subsample' : [0.8], 'colsample_bytree' : [0.8], 'reg_alpha' : [0.005], 'n_estimators': [1000]} + +gbm = xgb.XGBRegressor() + +grid_mse = GridSearchCV(param_grid = gbm_param_grid, estimator = gbm, scoring = 'neg_mean_squared_error', cv = 4, verbose = 1) + +grid_mse.fit(X,y) + + +print("Best parameters found: ", grid_mse.best_params_) +print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_))) +''' + +######################################################################################## +############# An attempt to do hyperparameter tuning for the classifier fit ############ +######################################################################################## +space = {"max_depth": hp.quniform("max_depth", 3, 18, 1), + "gamma": hp.uniform("gamma", 1, 9), + "reg_alpha": hp.quniform("reg_alpha", 40, 180, 1), + "reg_lambda": hp.uniform("reg_lambda", 0, 1), + "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1), + "min_child_weight": hp.quniform("min_child_weight", 0, 10, 1), + "n_estimators": 200, + "learning_rate": hp.uniform("learning_rate", 0.001, 0.1), + "subsample": hp.uniform("subsample", 0.8, 1), + "seed":0} + +#learning_rate = space['learning_rate'], + +def objective(space): + clf = xgb.XGBClassifier( n_estimators = int(space['n_estimators']), max_depth = int(space['max_depth']), gamma = space['gamma'], reg_alpha = int(space['reg_alpha']), min_child_weight = int(space['min_child_weight']), colsample_bytree = int(space['colsample_bytree']), eval_metric = 'auc', early_stopping_rounds = 10) + evaluation = [(X_train, y_train), (X_test, y_test)] + + clf.fit(X_train, y_train, eval_set = evaluation, verbose = False) + pred = 
clf.predict(X_test) + accuracy = accuracy_score(y_test, pred>0.5) + print("SCORE: ", accuracy) + return {'loss': -accuracy, 'status': STATUS_OK} + + +######################################################################################### +############# Create pipelines for xgb training ######################################### +######################################################################################### +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import OneHotEncoder + +categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + +from sklearn.preprocessing import StandardScaler +numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + +cat_cols = X.select_dtypes(exclude = "number").columns +num_cols = X.select_dtypes(include = "number").columns + +print(cat_cols) +print(num_cols) + +from sklearn.compose import ColumnTransformer + +full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + + + +X_processed = full_processor.fit_transform(X) +y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + +######################################################################################### +############ split dataset into training and test ####################################### +######################################################################################### +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state = 1121218) +#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 101, stratify = y) +print(X_train) +print(X_test) +print(y_train) + +############################################################################################################ +######### preparing the XGB classifiers in 20 x 5-folds cross validation using repeated k-fold ############# +############################################################################################################ +cv = RepeatedKFold(n_splits = 8, n_repeats = 20, random_state = 101) +folds = [(train, test) for train, test in cv.split(X_train, y_train)] +#print(folds) +metrics = ['auc', 'fpr', 'tpr', 'thresholds'] +results = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +results_zero_train = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +results_weak_train = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + +params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': eta} +with open(net_path + f"plot/{folder_save}/results_first_{eta}.json", 'w') as outfile: + json.dump(results, outfile) + + + +dtest = xgb.DMatrix(X_test, label = y_test) +#print(dtest) +for train, test in tqdm(folds, total = len(folds)): + print('train') + dtrain = xgb.DMatrix(X_train[train,:], + label = y_train[train]) + dval = xgb.DMatrix(X_train[test, :], label = y_train[test]) + model = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 10, num_boost_round = 200) 
#num_boost_round = 1000, 200 is optimal + model_zero_train = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 10, num_boost_round = 0) #num_boost_round = 1000, 200 is optimal + model_weak_train = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 10, num_boost_round = 20) #num_boost_round = 1000, 200 is optimal + sets = [dtrain, dval, dtest] + for i, ds in enumerate(results.keys()): + print(i) + y_preds = model.predict(sets[i]) + y_preds_zero_train = model_zero_train.predict(sets[i]) + y_preds_weak_train = model_weak_train.predict(sets[i]) + labels = sets[i].get_label() + fpr, tpr, thresholds = roc_curve(labels, y_preds) + fpr_zero, tpr_zero, thresholds_zero = roc_curve(labels, y_preds_zero_train) + fpr_weak, tpr_weak, thresholds_weak = roc_curve(labels, y_preds_weak_train) + results[ds]['fpr'].append(fpr) + results[ds]['tpr'].append(tpr) + results[ds]['thresholds'].append(thresholds) + results[ds]['auc'].append(roc_auc_score(labels, y_preds)) + results_zero_train[ds]['fpr'].append(fpr_zero) + results_zero_train[ds]['tpr'].append(tpr_zero) + results_zero_train[ds]['thresholds'].append(thresholds_zero) + results_zero_train[ds]['auc'].append(roc_auc_score(labels, y_preds_zero_train)) + results_weak_train[ds]['fpr'].append(fpr_weak) + results_weak_train[ds]['tpr'].append(tpr_weak) + results_weak_train[ds]['thresholds'].append(thresholds_weak) + results_weak_train[ds]['auc'].append(roc_auc_score(labels, y_preds_weak_train)) + +def convert(x): + if hasattr(x, "tolist"): + return x.tolist() + raise TypeError(x) + +with open(f"./plot/{folder_save}/results_lr_{eta}_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}.json", 'w') as outfile: + #json.dump(results, outfile, indent = 4) + str_j = json.dumps(results, indent = 4, sort_keys = True, default=convert) + outfile.write(str_j) + +with open(f"./plot/{folder_save}/results_zero_train_lr_{eta}_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}.json", 'w') as outfile: + #json.dump(results, outfile, indent = 4) + str_j = json.dumps(results_zero_train, indent = 4, sort_keys = True, default=convert) + outfile.write(str_j) + +with open(f"./plot/{folder_save}/results_weak_train_lr_{eta}_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}.json", 'w') as outfile: + #json.dump(results, outfile, indent = 4) + str_j = json.dumps(results_weak_train, indent = 4, sort_keys = True, default=convert) + outfile.write(str_j) + +########################################################################################################## +############## plotting the ROC curves with uncertainties ################################################ +########################################################################################################## +kind = 'val' + +c_fill = 'rgba(52, 152, 219, 0.2)' +c_line = 'rgba(52, 152, 219, 0.5)' +c_line_main = 'rgba(41, 128, 185, 1.0)' +c_grid = 'rgba(189, 195, 199, 0.5)' +c_annot = 'rgba(149, 165, 166, 0.5)' +c_highlight = 'rgba(192, 57, 43, 1.0)' + +fpr_mean = np.linspace(0, 1, 100) + +interp_tprs = [] +for i in range(100): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) +tpr_mean = np.mean(interp_tprs, axis = 0) +tpr_mean[-1] = 1.0 +tpr_std = 2*np.std(interp_tprs, axis = 0) +tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) +tpr_lower = tpr_mean - tpr_std +auc = np.mean(results[kind]['auc']) + 
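+# Caveat on the interpolation loop above: RepeatedKFold(n_splits = 8,
+# n_repeats = 20) yields 160 folds, so results[kind]['fpr'] holds 160 curves,
+# while range(100) (apparently a leftover from an earlier 5 x 20 configuration)
+# averages only the first 100 of them. Deriving the count from the stored
+# results keeps the band consistent with the CV setup; a minimal sketch:
+#
+#   n_folds = len(results[kind]['fpr'])   # 160 for 8 splits x 20 repeats
+#   interp_tprs = [np.interp(fpr_mean, results[kind]['fpr'][i],
+#                            results[kind]['tpr'][i]) for i in range(n_folds)]
+#   # (pin interp_tprs[i][0] = 0.0 as above before averaging)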
+import plotly.graph_objects as go + +fig = go.Figure([go.Scatter(x = tpr_upper, y = fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = tpr_mean, y = fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.3f}')]) + +fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 0, y1 = 1) +fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'FPR (Background rejection)', yaxis_title = 'TPR (signal efficiency)', width = 1600, height = 900, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) +fig.update_yaxes(range = [0,1], gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') +fig.update_xaxes(range = [0,1], gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + +fig.write_image(f"./plot/{folder_save}/ROI_simple/plotly_ROC_bg_eff_{eta}.jpg") +fig.write_image(f"./plot/{folder_save}/ROI_simple/plotly_ROC_bg_eff_{eta}.pdf") + + +fig = go.Figure([go.Scatter(x = 1 - fpr_mean, y = tpr_upper, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = 1 - fpr_mean, y = tpr_lower, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = 1 - fpr_mean, y = tpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.3f}')]) + +fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) +fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = '1 - FPR (Background rejection)', yaxis_title = 'TPR (signal efficiency)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) +fig.update_yaxes(range = [0,1], gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') +fig.update_xaxes(range = [0,1], gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + +fig.write_image(f"./plot/{folder_save}/ROI_simple/plotly_ROC_bg_rej_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}_{eta}.jpg") +fig.write_image(f"./plot/{folder_save}/ROI_simple/plotly_ROC_bg_rej_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}_{eta}.pdf") + +################################################################################################## +########## Actual hyperparameter tuning ########################################################## +################################################################################################## + +trials = Trials() + +best_hyperparams = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 100, trials = trials) +print("The best hyperparameters are: ", "\n") +print(best_hyperparams) + +################################################################################################## +################################################################################################## +################################################################################################## + + + + + + + + + + + + + + +from sklearn.metrics import accuracy_score + +### Init classifier +xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = best_hyperparams['learning_rate'], gamma = 
best_hyperparams['gamma'], reg_alpha = best_hyperparams['reg_alpha'], reg_lambda = best_hyperparams['reg_lambda'], n_estimators = 200, max_depth = int(best_hyperparams['max_depth']), subsample = best_hyperparams['subsample'], min_child_weight = best_hyperparams['min_child_weight'], colsample_bytree = best_hyperparams['colsample_bytree']) +#xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = 0.0292, gamma = 1.087, reg_alpha = 42.0, reg_lambda = 0.381, n_estimators = 200, max_depth = 8, subsample = 0.841, min_child_weight = 2.0, colsample_bytree = 0.994) + + +### Fit +dtest = xgb.DMatrix(X_test, label = y_test) +#print(dtest) +dtrain = xgb.DMatrix(X_train[:int(len(X_train)*0.8), :], label = y_train[:int(len(y_train)*0.8)]) +dval = xgb.DMatrix(X_train[int(len(X_train)*0.8):, :], label = y_train[int(len(y_train)*0.8):]) +model_xgb = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 200) #num_boost_round = 1000, +sets = [dtrain, dval, dtest] +results_new = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +params_new = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss'} + +for i, ds in enumerate(results_new.keys()): + print(i) + y_preds_new = model_xgb.predict(sets[i]) + labels_new = sets[i].get_label() + fpr_new, tpr_new, thresholds_new = roc_curve(labels_new, y_preds_new) + results_new[ds]['fpr'].append(fpr_new) + results_new[ds]['tpr'].append(tpr_new) + results_new[ds]['thresholds'].append(thresholds_new) + results_new[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new)) + +xgb_cl.fit(X_train, y_train) + +print(xgb_cl) +### Predict +preds = xgb_cl.predict(X_test) + +print(accuracy_score(y_test, preds)) + +print(y_test) +print(model_xgb.predict(dtest)) +print(np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)])) +predict_train = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)]) + +print(accuracy_score(y_test, predict_train)) + +from xgboost import plot_importance +from xgboost import plot_tree, to_graphviz + +importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_}) +importances = importances.sort_values(by = "Importance", ascending = False) +importances = importances.set_index('Feature') +print(importances) +importances.plot.bar() + +fig, ax = plt.subplots(figsize=(17,12)) +plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax) +plt.xlabel('Feature scores') +plt.ylabel("Feature names") +plt.title('Importance plot') +plt.legend(['']) +#plt.show() +plt.savefig(f"./plot/{folder_save}/importance_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}_{eta}.jpg") + +feature_importance = model.get_score(importance_type = 'weight') +keys = list(feature_importance.keys()) +names_sig = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] +values = list(feature_importance.values()) +data = pd.DataFrame(data = values, index = names_sig, columns = ['score']).sort_values(by = 'score', ascending = False) +print(data) +print(data.index) + + +fig = plt.figure(figsize=(17,12)) +ax1 = fig.add_subplot(1,2,1) +ax1.set_axis_off() +ax2 = fig.add_subplot(1,2,2) +ax2.barh(list(reversed(data.index)), 
list(reversed(data.score))) +ax2.set_xlabel('Feature scores') +ax2.set_ylabel("Feature names") +ax2.set_title('Importance plot') +#plt.show() +plt.savefig(f"./plot/{folder_save}/ROI_simple/importance_train_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}_{eta}.jpg") + +plt.figure(figsize=(17,12)) +plot_tree(xgb_cl, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"./plot/{folder_save}/ROI_simple/boost_tree_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}_{eta}.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' + +plt.figure(figsize=(17,12)) +plot_tree(model_xgb, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"./plot/{folder_save}/ROI_simple/boost_tree_train_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}_{eta}.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' +''' +plt.figure(figsize=(17,12)) +to_graphviz(model_xgb, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"plot/{folder_save}/boost_tree_train_graphviz.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' diff --git a/xgb_test_only_xgb_no_coffea_diff_bgs_3bgs.py b/xgb_test_only_xgb_no_coffea_diff_bgs_3bgs.py new file mode 100644 index 0000000..9f09fd5 --- /dev/null +++ b/xgb_test_only_xgb_no_coffea_diff_bgs_3bgs.py @@ -0,0 +1,417 @@ +from coffea.util import load +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt, mplhep as hep +import hist +import argparse, sys, os, arrow, glob, yaml +from matplotlib.offsetbox import AnchoredText +import xgboost as xgb +from hyperopt import STATUS_OK, Trials, fmin, hp, tpe +from sklearn.metrics import accuracy_score +from tqdm.notebook import tqdm +from sklearn.metrics import roc_auc_score, roc_curve +from sklearn.model_selection import RepeatedKFold +import json + +####################################################################################### +## Create the folder to save the data if it doesn't exist and read in the dataframe ### +####################################################################################### +net_path = "/net/scratch_cms3a/vaulin/" +folder_save = 'eval_23_08_08' +roi = 'low_mumu' +if not os.path.exists(f"./plot/{folder_save}"): + os.mkdir(f"./plot/{folder_save}") +if not os.path.exists(f"./plot/{folder_save}/ROI_simple"): + os.mkdir(f"./plot/{folder_save}/ROI_simple") + +if not os.path.exists(net_path + f"plot/{folder_save}"): + os.mkdir(net_path + f"plot/{folder_save}") +df = pd.read_csv(f'./plot/{folder_save}/xgb_training_dataset_{roi}.csv') + + +bgs = ['DY', "ZZ", "WZ", "tt", "ZHtobb"] +bg_choice = 2 +bg_choice_2 = 0 +bg_choice_3 = 1 + +eta = 0.03 +#eta = 0.03, 0.12, 0.3, 0.45, 0.8 + +df = df[(df.target_bg == 0)|(df.target_bg == bg_choice+1)|(df.target_bg == bg_choice_2+1)|(df.target_bg == bg_choice_3+1)] + +time = arrow.now().format("YY_MM_DD") +plt.style.use(hep.style.ROOT) + + +######################################################################################## +########## drop target from df and bring it to a separate column, drop weights ######### +######################################################################################## +X = df.drop("target", axis = 1) +X = X.drop("target_bg", axis = 1) +print(X) +X = X.drop(f"wei_{roi}", axis = 1) +X = X.drop(f"Z_mass_{roi}", axis = 1) +X = X.drop(f"Z_pt_gen_{roi}", axis = 1) +X = X.drop(f"Z_mass_gen_{roi}", axis = 1) +print(X) +print(X.info()) + +y = 
df["target"]
+print(y)
+
+
+########################################################################################
+################# GRID search attempt ##################################################
+########################################################################################
+'''
+from sklearn.model_selection import GridSearchCV
+
+### Create the parameter grid
+gbm_param_grid = {'max_depth' : [3, 4, 5, 6, 7, 8, 9], 'min_child_weight' : [1], 'gamma' : [0], 'subsample' : [0.8], 'colsample_bytree' : [0.8], 'reg_alpha' : [0.005], 'n_estimators': [1000]}
+
+gbm = xgb.XGBRegressor()
+
+grid_mse = GridSearchCV(param_grid = gbm_param_grid, estimator = gbm, scoring = 'neg_mean_squared_error', cv = 4, verbose = 1)
+
+grid_mse.fit(X,y)
+
+
+print("Best parameters found: ", grid_mse.best_params_)
+print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_)))
+'''
+
+########################################################################################
+############# An attempt to do hyperparameter tuning for the classifier fit ############
+########################################################################################
+space = {"max_depth": hp.quniform("max_depth", 3, 18, 1),
+        "gamma": hp.uniform("gamma", 1, 9),
+        "reg_alpha": hp.quniform("reg_alpha", 40, 180, 1),
+        "reg_lambda": hp.uniform("reg_lambda", 0, 1),
+        "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1),
+        "min_child_weight": hp.quniform("min_child_weight", 0, 10, 1),
+        "n_estimators": 200,
+        "learning_rate": hp.uniform("learning_rate", 0.001, 0.1),
+        "subsample": hp.uniform("subsample", 0.8, 1),
+        "seed":0}
+
+#learning_rate = space['learning_rate'],
+
+def objective(space):
+    # keep colsample_bytree as a float: int() would floor the sampled 0.5-1 range to 0
+    clf = xgb.XGBClassifier( n_estimators = int(space['n_estimators']), max_depth = int(space['max_depth']), gamma = space['gamma'], reg_alpha = int(space['reg_alpha']), min_child_weight = int(space['min_child_weight']), colsample_bytree = float(space['colsample_bytree']), eval_metric = 'auc', early_stopping_rounds = 10)
+    evaluation = [(X_train, y_train), (X_test, y_test)]
+
+    clf.fit(X_train, y_train, eval_set = evaluation, verbose = False)
+    pred = clf.predict(X_test)
+    accuracy = accuracy_score(y_test, pred>0.5)
+    print("SCORE: ", accuracy)
+    return {'loss': -accuracy, 'status': STATUS_OK}
+
+
+#########################################################################################
+############# Create pipelines for xgb training #########################################
+#########################################################################################
+from sklearn.impute import SimpleImputer
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import OneHotEncoder
+
+categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),])
+
+from sklearn.preprocessing import StandardScaler
+numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())])
+
+cat_cols = X.select_dtypes(exclude = "number").columns
+num_cols = X.select_dtypes(include = "number").columns
+
+print(cat_cols)
+print(num_cols)
+
+from sklearn.compose import ColumnTransformer
+
+full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),])
+
+
+
+X_processed = full_processor.fit_transform(X)
+y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1))
+
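A side note on the search space above: hp.quniform proposes quantized floats rather than true integers, which is why objective() casts max_depth, reg_alpha and min_child_weight back to int. A minimal sketch to eyeball what fmin() will actually draw, assuming only that hyperopt is installed and that `space` is the dict defined above:

from hyperopt.pyll.stochastic import sample

# draw one random configuration from the search space;
# repeat a few times to see the ranges the optimizer will explore
print(sample(space))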
+######################################################################################### +############ split dataset into training and test ####################################### +######################################################################################### +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state = 1121218) +#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 101, stratify = y) +print(X_train) +print(X_test) +print(y_train) + +############################################################################################################ +######### preparing the XGB classifiers in 20 x 5-folds cross validation using repeated k-fold ############# +############################################################################################################ +cv = RepeatedKFold(n_splits = 8, n_repeats = 20, random_state = 101) +folds = [(train, test) for train, test in cv.split(X_train, y_train)] +#print(folds) +metrics = ['auc', 'fpr', 'tpr', 'thresholds'] +results = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +results_zero_train = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +results_weak_train = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + +params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': eta} +with open(net_path + f"plot/{folder_save}/results_first_{eta}.json", 'w') as outfile: + json.dump(results, outfile) + + + +dtest = xgb.DMatrix(X_test, label = y_test) +#print(dtest) +for train, test in tqdm(folds, total = len(folds)): + print('train') + dtrain = xgb.DMatrix(X_train[train,:], + label = y_train[train]) + dval = xgb.DMatrix(X_train[test, :], label = y_train[test]) + model = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 10, num_boost_round = 200) #num_boost_round = 1000, 200 is optimal + model_zero_train = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 10, num_boost_round = 0) #num_boost_round = 1000, 200 is optimal + model_weak_train = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 10, num_boost_round = 20) #num_boost_round = 1000, 200 is optimal + sets = [dtrain, dval, dtest] + for i, ds in enumerate(results.keys()): + print(i) + y_preds = model.predict(sets[i]) + y_preds_zero_train = model_zero_train.predict(sets[i]) + y_preds_weak_train = model_weak_train.predict(sets[i]) + labels = sets[i].get_label() + fpr, tpr, thresholds = roc_curve(labels, y_preds) + fpr_zero, tpr_zero, thresholds_zero = roc_curve(labels, y_preds_zero_train) + fpr_weak, tpr_weak, thresholds_weak = roc_curve(labels, y_preds_weak_train) + results[ds]['fpr'].append(fpr) + results[ds]['tpr'].append(tpr) + results[ds]['thresholds'].append(thresholds) + results[ds]['auc'].append(roc_auc_score(labels, y_preds)) + results_zero_train[ds]['fpr'].append(fpr_zero) + results_zero_train[ds]['tpr'].append(tpr_zero) + results_zero_train[ds]['thresholds'].append(thresholds_zero) + results_zero_train[ds]['auc'].append(roc_auc_score(labels, y_preds_zero_train)) + 
results_weak_train[ds]['fpr'].append(fpr_weak) + results_weak_train[ds]['tpr'].append(tpr_weak) + results_weak_train[ds]['thresholds'].append(thresholds_weak) + results_weak_train[ds]['auc'].append(roc_auc_score(labels, y_preds_weak_train)) + +def convert(x): + if hasattr(x, "tolist"): + return x.tolist() + raise TypeError(x) + +with open(f"./plot/{folder_save}/results_lr_{eta}_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}_{bgs[bg_choice_3]}.json", 'w') as outfile: + #json.dump(results, outfile, indent = 4) + str_j = json.dumps(results, indent = 4, sort_keys = True, default=convert) + outfile.write(str_j) + +with open(f"./plot/{folder_save}/results_zero_train_lr_{eta}_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}_{bgs[bg_choice_3]}.json", 'w') as outfile: + #json.dump(results, outfile, indent = 4) + str_j = json.dumps(results_zero_train, indent = 4, sort_keys = True, default=convert) + outfile.write(str_j) + +with open(f"./plot/{folder_save}/results_weak_train_lr_{eta}_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}_{bgs[bg_choice_3]}.json", 'w') as outfile: + #json.dump(results, outfile, indent = 4) + str_j = json.dumps(results_weak_train, indent = 4, sort_keys = True, default=convert) + outfile.write(str_j) + +########################################################################################################## +############## plotting the ROC curves with uncertainties ################################################ +########################################################################################################## +kind = 'val' + +c_fill = 'rgba(52, 152, 219, 0.2)' +c_line = 'rgba(52, 152, 219, 0.5)' +c_line_main = 'rgba(41, 128, 185, 1.0)' +c_grid = 'rgba(189, 195, 199, 0.5)' +c_annot = 'rgba(149, 165, 166, 0.5)' +c_highlight = 'rgba(192, 57, 43, 1.0)' + +fpr_mean = np.linspace(0, 1, 100) + +interp_tprs = [] +for i in range(100): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) +tpr_mean = np.mean(interp_tprs, axis = 0) +tpr_mean[-1] = 1.0 +tpr_std = 2*np.std(interp_tprs, axis = 0) +tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) +tpr_lower = tpr_mean - tpr_std +auc = np.mean(results[kind]['auc']) + +import plotly.graph_objects as go + +fig = go.Figure([go.Scatter(x = tpr_upper, y = fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = tpr_mean, y = fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.3f}')]) + +fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 0, y1 = 1) +fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'FPR (Background rejection)', yaxis_title = 'TPR (signal efficiency)', width = 1600, height = 900, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) +fig.update_yaxes(range = [0,1], gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') +fig.update_xaxes(range = [0,1], gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + +fig.write_image(f"./plot/{folder_save}/ROI_simple/plotly_ROC_bg_eff_{eta}.jpg") +fig.write_image(f"./plot/{folder_save}/ROI_simple/plotly_ROC_bg_eff_{eta}.pdf") + + +fig = go.Figure([go.Scatter(x = 1 - fpr_mean, y = tpr_upper, line = 
dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = 1 - fpr_mean, y = tpr_lower, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = 1 - fpr_mean, y = tpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.3f}')]) + +fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) +fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = '1 - FPR (Background rejection)', yaxis_title = 'TPR (signal efficiency)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) +fig.update_yaxes(range = [0,1], gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') +fig.update_xaxes(range = [0,1], gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + +fig.write_image(f"./plot/{folder_save}/ROI_simple/plotly_ROC_bg_rej_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}_{bgs[bg_choice_3]}_{eta}.jpg") +fig.write_image(f"./plot/{folder_save}/ROI_simple/plotly_ROC_bg_rej_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}_{bgs[bg_choice_3]}_{eta}.pdf") + +################################################################################################## +########## Actual hyperparameter tuning ########################################################## +################################################################################################## + +trials = Trials() + +best_hyperparams = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 100, trials = trials) +print("The best hyperparameters are: ", "\n") +print(best_hyperparams) + +################################################################################################## +################################################################################################## +################################################################################################## + + + + + + + + + + + + + + +from sklearn.metrics import accuracy_score + +### Init classifier +xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = best_hyperparams['learning_rate'], gamma = best_hyperparams['gamma'], reg_alpha = best_hyperparams['reg_alpha'], reg_lambda = best_hyperparams['reg_lambda'], n_estimators = 200, max_depth = int(best_hyperparams['max_depth']), subsample = best_hyperparams['subsample'], min_child_weight = best_hyperparams['min_child_weight'], colsample_bytree = best_hyperparams['colsample_bytree']) +#xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = 0.0292, gamma = 1.087, reg_alpha = 42.0, reg_lambda = 0.381, n_estimators = 200, max_depth = 8, subsample = 0.841, min_child_weight = 2.0, colsample_bytree = 0.994) + + +### Fit +dtest = xgb.DMatrix(X_test, label = y_test) +#print(dtest) +dtrain = xgb.DMatrix(X_train[:int(len(X_train)*0.8), :], label = y_train[:int(len(y_train)*0.8)]) +dval = xgb.DMatrix(X_train[int(len(X_train)*0.8):, :], label = y_train[int(len(y_train)*0.8):]) +model_xgb = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 200) #num_boost_round = 1000, +sets = [dtrain, dval, dtest] +results_new = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +params_new = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss'} + +for i, ds in 
enumerate(results_new.keys()): + print(i) + y_preds_new = model_xgb.predict(sets[i]) + labels_new = sets[i].get_label() + fpr_new, tpr_new, thresholds_new = roc_curve(labels_new, y_preds_new) + results_new[ds]['fpr'].append(fpr_new) + results_new[ds]['tpr'].append(tpr_new) + results_new[ds]['thresholds'].append(thresholds_new) + results_new[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new)) + +xgb_cl.fit(X_train, y_train) + +print(xgb_cl) +### Predict +preds = xgb_cl.predict(X_test) + +print(accuracy_score(y_test, preds)) + +print(y_test) +print(model_xgb.predict(dtest)) +print(np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)])) +predict_train = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)]) + +print(accuracy_score(y_test, predict_train)) + +from xgboost import plot_importance +from xgboost import plot_tree, to_graphviz + +importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_}) +importances = importances.sort_values(by = "Importance", ascending = False) +importances = importances.set_index('Feature') +print(importances) +importances.plot.bar() + +fig, ax = plt.subplots(figsize=(17,12)) +plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax) +plt.xlabel('Feature scores') +plt.ylabel("Feature names") +plt.title('Importance plot') +plt.legend(['']) +#plt.show() +plt.savefig(f"./plot/{folder_save}/importance_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}_{bgs[bg_choice_3]}_{eta}.jpg") + +feature_importance = model.get_score(importance_type = 'weight') +keys = list(feature_importance.keys()) +names_sig = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] +values = list(feature_importance.values()) +data = pd.DataFrame(data = values, index = names_sig, columns = ['score']).sort_values(by = 'score', ascending = False) +print(data) +print(data.index) + + +fig = plt.figure(figsize=(17,12)) +ax1 = fig.add_subplot(1,2,1) +ax1.set_axis_off() +ax2 = fig.add_subplot(1,2,2) +ax2.barh(list(reversed(data.index)), list(reversed(data.score))) +ax2.set_xlabel('Feature scores') +ax2.set_ylabel("Feature names") +ax2.set_title('Importance plot') +#plt.show() +plt.savefig(f"./plot/{folder_save}/ROI_simple/importance_train_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}_{bgs[bg_choice_3]}_{eta}.jpg") + +plt.figure(figsize=(17,12)) +plot_tree(xgb_cl, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"./plot/{folder_save}/ROI_simple/boost_tree_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}_{bgs[bg_choice_3]}_{eta}.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' + +plt.figure(figsize=(17,12)) +plot_tree(model_xgb, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"./plot/{folder_save}/ROI_simple/boost_tree_train_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}_{bgs[bg_choice_3]}_{eta}.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' +''' +plt.figure(figsize=(17,12)) +to_graphviz(model_xgb, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"plot/{folder_save}/boost_tree_train_graphviz.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' diff --git 
a/xgb_test_only_xgb_no_coffea_diff_bgs_all etas.py b/xgb_test_only_xgb_no_coffea_diff_bgs_all etas.py
new file mode 100644
index 0000000..ab9f632
--- /dev/null
+++ b/xgb_test_only_xgb_no_coffea_diff_bgs_all etas.py
@@ -0,0 +1,530 @@
+
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt, mplhep as hep
+import hist
+import argparse, sys, os, arrow, glob, yaml
+from matplotlib.offsetbox import AnchoredText
+import xgboost as xgb
+from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
+from sklearn.metrics import accuracy_score
+from tqdm.notebook import tqdm
+from sklearn.metrics import roc_auc_score, roc_curve
+from sklearn.model_selection import RepeatedKFold
+import json
+from tkinter import filedialog as fd
+from tkinter import messagebox
+
+from tkinter import *
+
+window = Tk()
+window.title("XGBoost training")
+window.minsize(500,300)
+
+### Creating a label; pack() just puts it on screen
+label = Label(master = window, text = " Welcome to XGBoost training\n Choose the channel to start with:")
+label.pack(side = TOP, expand = True)
+def objective(space):
+    # keep colsample_bytree as a float: int() would floor the sampled 0.5-1 range to 0
+    clf = xgb.XGBClassifier( n_estimators = int(space['n_estimators']), max_depth = int(space['max_depth']), gamma = space['gamma'], reg_alpha = int(space['reg_alpha']), min_child_weight = int(space['min_child_weight']), colsample_bytree = float(space['colsample_bytree']), eval_metric = 'auc', early_stopping_rounds = 10)
+    evaluation = [(X_train, y_train), (X_test, y_test)]
+
+    clf.fit(X_train, y_train, eval_set = evaluation, verbose = False)
+    pred = clf.predict(X_test)
+    accuracy = accuracy_score(y_test, pred>0.5)
+    print("SCORE: ", accuracy)
+    return {'loss': -accuracy, 'status': STATUS_OK}
+
+def convert(x):
+    if hasattr(x, "tolist"):
+        return x.tolist()
+    raise TypeError(x)
+
+def main(back, eta, chan, file):
+    #######################################################################################
+    ## Create the folder to save the data if it doesn't exist and read in the dataframe ###
+    #######################################################################################
+    net_path = "/net/scratch_cms3a/vaulin/"
+    folder_save = 'eval_23_07_19'
+    roi = 'low_mumu'
+    if not os.path.exists(f"./plot/{folder_save}"):
+        os.mkdir(f"./plot/{folder_save}")
+    if not os.path.exists(f"./plot/{folder_save}/ROI_simple"):
+        os.mkdir(f"./plot/{folder_save}/ROI_simple")
+
+    df = pd.read_csv(file)
+
+
+    bgs = ['DY', "ZZ", "WZ", "tt", "ZHtobb"]
+    bg_choice = back
+
+    df = df[(df.target_bg == 0)|(df.target_bg == bg_choice+1)]
+
+    time = arrow.now().format("YY_MM_DD")
+    plt.style.use(hep.style.ROOT)
+
+
+    ########################################################################################
+    ########## drop target from df and bring it to a separate column, drop weights #########
+    ########################################################################################
+    X = df.drop("target", axis = 1)
+    X = X.drop("target_bg", axis = 1)
+    print(X)
+    X = X.drop(f"wei_{roi}", axis = 1)
+    X = X.drop(f"Z_mass_{roi}", axis = 1)
+    X = X.drop(f"Z_pt_gen_{roi}", axis = 1)
+    X = X.drop(f"Z_mass_gen_{roi}", axis = 1)
+    print(X)
+    print(X.info())
+
+    y = df["target"]
+    print(y)
+
+
+    ########################################################################################
+    ################# GRID search attempt ##################################################
+    ########################################################################################
+    '''
+    # =============================================================================
+    #
from sklearn.model_selection import GridSearchCV + # + # ### Creat the parameter grid + # gbm_param_grid = {'max_depth' : [3, 4, 5, 6, 7, 8, 9], 'min_child_weight' : [1], 'gamma' : [0], 'subsample' : [0.8], 'colsample_bytree' : [0.8], 'reg_alpha' : [0.005], 'n_estimators': [1000]} + # + # gbm = xgb.XGBRegressor() + # + # grid_mse = GridSearchCV(param_grid = gbm_param_grid, estimator = gbm, scoring = 'neg_mean_squared_error', cv = 4, verbose = 1) + # + # grid_mse.fit(X,y) + # + # + # print("Best parameters found: ", grid_mse.best_params_) + # print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_))) + # ============================================================================= + ''' + + ######################################################################################## + ############# An attempt to do hyperparameter tuning for the classifier fit ############ + ######################################################################################## + space = {"max_depth": hp.quniform("max_depth", 3, 18, 1), + "gamma": hp.uniform("gamma", 1, 9), + "reg_alpha": hp.quniform("reg_alpha", 40, 180, 1), + "reg_lambda": hp.uniform("reg_lambda", 0, 1), + "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1), + "min_child_weight": hp.quniform("min_child_weight", 0, 10, 1), + "n_estimators": 200, + "learning_rate": hp.uniform("learning_rate", 0.001, 0.1), + "subsample": hp.uniform("subsample", 0.8, 1), + "seed":0} + + #learning_rate = space['learning_rate'], + + + + + ######################################################################################### + ############# Create pipelines for xgb training ######################################### + ######################################################################################### + from sklearn.impute import SimpleImputer + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import OneHotEncoder + + categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + + from sklearn.preprocessing import StandardScaler + numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + + cat_cols = X.select_dtypes(exclude = "number").columns + num_cols = X.select_dtypes(include = "number").columns + + print(cat_cols) + print(num_cols) + + from sklearn.compose import ColumnTransformer + + full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + + + + X_processed = full_processor.fit_transform(X) + y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + + ######################################################################################### + ############ split dataset into training and test ####################################### + ######################################################################################### + from sklearn.model_selection import train_test_split + global X_train + global X_test + global y_train + global y_test + X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state = 1121218) + #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 101, stratify = y) + print(X_train) + print(X_test) + print(y_train) + + 
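One thing to keep in mind for the cross-validation block below: its banner speaks of "20 x 5-folds", but the code constructs RepeatedKFold(n_splits = 8, n_repeats = 20), i.e. 160 train/validation index pairs per run. A two-line sanity check (a standalone sketch using the same constructor arguments):

from sklearn.model_selection import RepeatedKFold

# 8 folds repeated 20 times -> 160 (train, test) splits
print(RepeatedKFold(n_splits = 8, n_repeats = 20, random_state = 101).get_n_splits())  # 160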
############################################################################################################ + ######### preparing the XGB classifiers in 20 x 5-folds cross validation using repeated k-fold ############# + ############################################################################################################ + cv = RepeatedKFold(n_splits = 8, n_repeats = 20, random_state = 101) + folds = [(train, test) for train, test in cv.split(X_train, y_train)] + #print(folds) + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + results = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + results_zero_train = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + results_weak_train = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + eta = eta + + params = {'objective': 'binary:logistic', + 'eval_metric': 'logloss', 'eta': eta} + with open(f"./plot/{folder_save}/results_first_{eta}.json", 'w') as outfile: + json.dump(results, outfile) + + dtest = xgb.DMatrix(X_test, label=y_test) + #print(dtest) + for train, test in tqdm(folds, total=len(folds)): + print('train') + dtrain = xgb.DMatrix(X_train[train, :], + label=y_train[train]) + dval = xgb.DMatrix(X_train[test, :], label=y_train[test]) + model = xgb.train(dtrain=dtrain, params=params, evals=[(dtrain, 'train'), (dval, 'dval')], + verbose_eval=1, early_stopping_rounds=10, num_boost_round=200) # num_boost_round = 1000, 200 is optimal + model_zero_train = xgb.train(dtrain=dtrain, params=params, evals=[(dtrain, 'train'), (dval, 'dval')], + verbose_eval=1, early_stopping_rounds=10, num_boost_round=0) # num_boost_round = 1000, 200 is optimal + model_weak_train = xgb.train(dtrain=dtrain, params=params, evals=[(dtrain, 'train'), (dval, 'dval')], + verbose_eval=1, early_stopping_rounds=10, num_boost_round=20) # num_boost_round = 1000, 200 is optimal + sets = [dtrain, dval, dtest] + for i, ds in enumerate(results.keys()): + print(i) + y_preds = model.predict(sets[i]) + y_preds_zero_train = model_zero_train.predict(sets[i]) + y_preds_weak_train = model_weak_train.predict(sets[i]) + labels = sets[i].get_label() + fpr, tpr, thresholds = roc_curve(labels, y_preds) + fpr_zero, tpr_zero, thresholds_zero = roc_curve( + labels, y_preds_zero_train) + fpr_weak, tpr_weak, thresholds_weak = roc_curve( + labels, y_preds_weak_train) + results[ds]['fpr'].append(fpr) + results[ds]['tpr'].append(tpr) + results[ds]['thresholds'].append(thresholds) + results[ds]['auc'].append(roc_auc_score(labels, y_preds)) + results_zero_train[ds]['fpr'].append(fpr_zero) + results_zero_train[ds]['tpr'].append(tpr_zero) + results_zero_train[ds]['thresholds'].append(thresholds_zero) + results_zero_train[ds]['auc'].append(roc_auc_score(labels, y_preds_zero_train)) + results_weak_train[ds]['fpr'].append(fpr_weak) + results_weak_train[ds]['tpr'].append(tpr_weak) + results_weak_train[ds]['thresholds'].append(thresholds_weak) + results_weak_train[ds]['auc'].append(roc_auc_score(labels, y_preds_weak_train)) + + + + with open(f"./plot/{folder_save}/results_lr_{eta}_bg_{bgs[bg_choice]}.json", 'w') as outfile: + #json.dump(results, outfile, indent = 4) + str_j = json.dumps(results, indent = 4, sort_keys = True, default=convert) + outfile.write(str_j) + + with open(f"./plot/{folder_save}/results_zero_train_lr_{eta}_bg_{bgs[bg_choice]}.json", 'w') as outfile: + #json.dump(results, outfile, indent = 4) + str_j = 
json.dumps(results_zero_train, indent = 4, sort_keys = True, default=convert) + outfile.write(str_j) + + with open(f"./plot/{folder_save}/results_weak_train_lr_{eta}_bg_{bgs[bg_choice]}.json", 'w') as outfile: + #json.dump(results, outfile, indent = 4) + str_j = json.dumps(results_weak_train, indent = 4, sort_keys = True, default=convert) + outfile.write(str_j) + + ########################################################################################################## + ############## plotting the ROC curves with uncertainties ################################################ + ########################################################################################################## + kind = 'val' + + c_fill = 'rgba(52, 152, 219, 0.2)' + c_line = 'rgba(52, 152, 219, 0.5)' + c_line_main = 'rgba(41, 128, 185, 1.0)' + c_grid = 'rgba(189, 195, 199, 0.5)' + c_annot = 'rgba(149, 165, 166, 0.5)' + c_highlight = 'rgba(192, 57, 43, 1.0)' + + fpr_mean = np.linspace(0, 1, 100) + + interp_tprs = [] + for i in range(100): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) + tpr_mean = np.mean(interp_tprs, axis = 0) + tpr_mean[-1] = 1.0 + tpr_std = 2*np.std(interp_tprs, axis = 0) + tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) + tpr_lower = tpr_mean - tpr_std + auc = np.mean(results[kind]['auc']) + + import plotly.graph_objects as go + + fig = go.Figure([go.Scatter(x = tpr_upper, y = fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = tpr_mean, y = fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.3f}')]) + + fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 0, y1 = 1) + fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'FPR (Background rejection)', yaxis_title = 'TPR (signal efficiency)', width = 1600, height = 900, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) + fig.update_yaxes(range = [0,1], gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') + fig.update_xaxes(range = [0,1], gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + + fig.write_image(f"./plot/{folder_save}/ROI_simple/plotly_ROC_bg_eff_{eta}.jpg") + fig.write_image(f"./plot/{folder_save}/ROI_simple/plotly_ROC_bg_eff_{eta}.pdf") + + + fig = go.Figure([go.Scatter(x = 1 - fpr_mean, y = tpr_upper, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = 1 - fpr_mean, y = tpr_lower, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = 1 - fpr_mean, y = tpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.3f}')]) + + fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) + fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = '1 - FPR (Background rejection)', yaxis_title = 'TPR (signal efficiency)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) + fig.update_yaxes(range = [0,1], gridcolor 
= c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') + fig.update_xaxes(range = [0,1], gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + + fig.write_image(f"./plot/{folder_save}/ROI_simple/plotly_ROC_bg_rej_bg_{bgs[bg_choice]}_{eta}.jpg") + fig.write_image(f"./plot/{folder_save}/ROI_simple/plotly_ROC_bg_rej_bg_{bgs[bg_choice]}_{eta}.pdf") + + ################################################################################################## + ########## Actual hyperparameter tuning ########################################################## + ################################################################################################## + + trials = Trials() + + best_hyperparams = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 100, trials = trials) + print("The best hyperparameters are: ", "\n") + print(best_hyperparams) + + ################################################################################################## + ################################################################################################## + ################################################################################################## + + + + + + + + + + + + + + + from sklearn.metrics import accuracy_score + + ### Init classifier + xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = best_hyperparams['learning_rate'], gamma = best_hyperparams['gamma'], reg_alpha = best_hyperparams['reg_alpha'], reg_lambda = best_hyperparams['reg_lambda'], n_estimators = 200, max_depth = int(best_hyperparams['max_depth']), subsample = best_hyperparams['subsample'], min_child_weight = best_hyperparams['min_child_weight'], colsample_bytree = best_hyperparams['colsample_bytree']) + #xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = 0.0292, gamma = 1.087, reg_alpha = 42.0, reg_lambda = 0.381, n_estimators = 200, max_depth = 8, subsample = 0.841, min_child_weight = 2.0, colsample_bytree = 0.994) + + + ### Fit + dtest = xgb.DMatrix(X_test, label = y_test) + #print(dtest) + dtrain = xgb.DMatrix(X_train[:int(len(X_train)*0.8), :], label = y_train[:int(len(y_train)*0.8)]) + dval = xgb.DMatrix(X_train[int(len(X_train)*0.8):, :], label = y_train[int(len(y_train)*0.8):]) + model_xgb = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 200) #num_boost_round = 1000, + sets = [dtrain, dval, dtest] + results_new = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + params_new = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss'} + + for i, ds in enumerate(results_new.keys()): + print(i) + y_preds_new = model_xgb.predict(sets[i]) + labels_new = sets[i].get_label() + fpr_new, tpr_new, thresholds_new = roc_curve(labels_new, y_preds_new) + results_new[ds]['fpr'].append(fpr_new) + results_new[ds]['tpr'].append(tpr_new) + results_new[ds]['thresholds'].append(thresholds_new) + results_new[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new)) + + xgb_cl.fit(X_train, y_train) + + print(xgb_cl) + ### Predict + preds = xgb_cl.predict(X_test) + + print(accuracy_score(y_test, preds)) + + print(y_test) + print(model_xgb.predict(dtest)) + print(np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)])) + predict_train = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)]) + + print(accuracy_score(y_test, predict_train)) + + from xgboost import plot_importance + 
from xgboost import plot_tree, to_graphviz + + importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_}) + importances = importances.sort_values(by = "Importance", ascending = False) + importances = importances.set_index('Feature') + print(importances) + importances.plot.bar() + + fig, ax = plt.subplots(figsize=(17,12)) + plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax) + plt.xlabel('Feature scores') + plt.ylabel("Feature names") + plt.title('Importance plot') + plt.legend(['']) + #plt.show() + plt.savefig(f"./plot/{folder_save}/importance_bg_{bgs[bg_choice]}_{eta}.jpg") + + feature_importance = model.get_score(importance_type = 'weight') + keys = list(feature_importance.keys()) + names_sig = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + values = list(feature_importance.values()) + data = pd.DataFrame(data = values, index = names_sig, columns = ['score']).sort_values(by = 'score', ascending = False) + print(data) + print(data.index) + + + fig = plt.figure(figsize=(17,12)) + ax1 = fig.add_subplot(1,2,1) + ax1.set_axis_off() + ax2 = fig.add_subplot(1,2,2) + ax2.barh(list(reversed(data.index)), list(reversed(data.score))) + ax2.set_xlabel('Feature scores') + ax2.set_ylabel("Feature names") + ax2.set_title('Importance plot') + #plt.show() + plt.savefig(f"./plot/{folder_save}/ROI_simple/importance_train_bg_{bgs[bg_choice]}_{eta}.jpg") + + plt.figure(figsize=(17,12)) + plot_tree(xgb_cl, fmap = 'feature_map.txt') + plt.title('Decision tree graph') + #plt.show() + plt.savefig(f"./plot/{folder_save}/ROI_simple/boost_tree_bg_{bgs[bg_choice]}_{eta}.jpg", dpi = 1800) + ###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 + #plt.show()''' + + #plt.figure(figsize=(17,12)) + #plot_tree(model_xgb, fmap = 'feature_map.txt') + #plt.title('Decision tree graph') + #plt.show() + #plt.savefig(f"./plot/{folder_save}/ROI_simple/boost_tree_train_bg_{bgs[bg_choice]}_{eta}.jpg", dpi = 1800) + ###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 + #plt.show()''' +''' + plt.figure(figsize=(17,12)) + to_graphviz(model_xgb, fmap = 'feature_map.txt') + plt.title('Decision tree graph') + #plt.show() + plt.savefig(f"plot/{folder_save}/boost_tree_train_graphviz.jpg", dpi = 1800) + ###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 + #plt.show()''' + + +###Checkbutton +def checkused(): + global checkbutton + ##Returns 1 if used, otherwise 0 + print(checked_state.get()) + label.config(text = "Enjoy the ride - it will take some time") + checkbutton.pack_forget() + main(back_to_use[0], eta_to_use[1], chan_to_use[1], file_to_use) + + + +checked_state = IntVar() +checkbutton = Checkbutton(text = "Ready to start?", variable = checked_state, command = checkused) + + + + +back_to_use = 0 +eta_to_use = 0 +chan_to_use = 0 +file_to_use = 0 + +###Listbox +def listboxused(event): + global back_to_use + global listbox_eta + #print(listbox.get(listbox.curselection())) + back_to_use = int(list(listbox.curselection())[0]), listbox.get(listbox.curselection()) + label.config(text = "Now choose the learning rate eta to work with") + listbox.pack_forget() + + + listbox_eta.pack() + +def listboxused_eta(event): + global eta_to_use + global checkbutton + print(listbox_eta.get(listbox_eta.curselection())) + eta_to_use = 
int(list(listbox_eta.curselection())[0]), listbox_eta.get(listbox_eta.curselection())
+    label.config(text = "You are ready to start")
+    listbox_eta.pack_forget()
+    messagebox.showinfo(title = "Training info",
+                        message = f"Channel: {chan_to_use[1]}\nFile: {file_to_use}\nBackground: {back_to_use[1]}\nLearning rate (eta): {eta_to_use[1]}")
+    checkbutton.pack()
+
+###Listbox
+def listboxused_chan(event):
+    global chan_to_use
+    global file_to_use
+    global listbox
+
+    print(listbox_chan.get(listbox_chan.curselection()))
+    chan_to_use = int(list(listbox_chan.curselection())[0]), listbox_chan.get(listbox_chan.curselection())
+    label.config(text = "Now choose the background to work with")
+    listbox_chan.pack_forget()
+    filetypes = (
+        ("text files", "*.txt"),
+        ("csv files", "*.csv"),
+        ("All files", "*.*"))
+    filename = fd.askopenfile(
+        title = "Open a file",
+        initialdir = "./plot/",
+        filetypes = filetypes)
+
+    messagebox.showinfo(title = "Selected file",
+                        message = filename.name)
+    file_to_use = filename.name
+
+    listbox.pack()
+
+
+listbox = Listbox(height = 5)
+names = ['DY', "ZZ", "WZ", "tt", "ZHtobb"]
+for bglist in names:
+    listbox.insert(names.index(bglist), bglist)
+listbox.bind("<<ListboxSelect>>", listboxused)  # Tk's virtual event for a selection change
+
+listbox_eta = Listbox(height = 5)
+etas = [0.03, 0.12, 0.3, 0.45, 0.8]
+for eta in etas:
+    listbox_eta.insert(etas.index(eta), eta)
+listbox_eta.bind("<<ListboxSelect>>", listboxused_eta)
+#listbox_eta.pack()
+
+listbox_chan = Listbox(height = 4)
+names_chan = ['low_mumu', 'low_ee', 'high_mumu', 'high_ee']
+for chan in names_chan:
+    listbox_chan.insert(names_chan.index(chan), chan)
+listbox_chan.bind("<<ListboxSelect>>", listboxused_chan)
+listbox_chan.pack()
+
+
+
+
+window.mainloop()
diff --git a/xgb_test_only_xgb_no_coffea_diff_bgs_full_bg_set.py b/xgb_test_only_xgb_no_coffea_diff_bgs_full_bg_set.py
new file mode 100644
index 0000000..f8d2e2b
--- /dev/null
+++ b/xgb_test_only_xgb_no_coffea_diff_bgs_full_bg_set.py
@@ -0,0 +1,418 @@
+from coffea.util import load
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt, mplhep as hep
+import hist
+import argparse, sys, os, arrow, glob, yaml
+from matplotlib.offsetbox import AnchoredText
+import xgboost as xgb
+from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
+from sklearn.metrics import accuracy_score
+from tqdm.notebook import tqdm
+from sklearn.metrics import roc_auc_score, roc_curve
+from sklearn.model_selection import RepeatedKFold
+import json
+
+#######################################################################################
+## Create the folder to save the data if it doesn't exist and read in the dataframe ###
+#######################################################################################
+net_path = "/net/scratch_cms3a/vaulin/"
+folder_save = 'eval_23_08_23_2'
+roi = 'low_mumu'
+if not os.path.exists(f"./plot/{folder_save}"):
+    os.mkdir(f"./plot/{folder_save}")
+if not os.path.exists(f"./plot/{folder_save}/ROI_simple"):
+    os.mkdir(f"./plot/{folder_save}/ROI_simple")
+if not os.path.exists(f"./plot/{folder_save}/ROI_simple/{roi}"):
+    os.mkdir(f"./plot/{folder_save}/ROI_simple/{roi}")
+
+if not os.path.exists(net_path + f"plot/{folder_save}"):
+    os.mkdir(net_path + f"plot/{folder_save}")
+df = pd.read_csv(f'./plot/{folder_save}/xgb_training_dataset_{roi}.csv')
+
+
+bgs = ['DY', "ZZ", "WZ", "tt", "ZHtobb"]
+#bg_choice = 2
+#bg_choice_2 = 0
+
+eta = 0.1
+#eta = 0.03, 0.12, 0.3, 0.45, 0.8
+
+#df = df[(df.target_bg == 0)|(df.target_bg == bg_choice+1)|(df.target_bg == bg_choice_2+1)]
+
+time = arrow.now().format("YY_MM_DD")
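For orientation: target_bg appears to encode the signal as 0 and the i-th entry of bgs as i+1, which is the convention the commented-out filter above and the explicit filters in the sibling scripts rely on. This is an assumption read off those filters, not a documented schema; a toy illustration:

import pandas as pd

bgs_demo = ['DY', 'ZZ', 'WZ', 'tt', 'ZHtobb']
toy = pd.DataFrame({'target_bg': [0, 1, 3, 5]})
# 0 -> signal, i >= 1 -> bgs_demo[i - 1]
print(toy['target_bg'].map(lambda t: 'signal' if t == 0 else bgs_demo[t - 1]).tolist())
# ['signal', 'DY', 'WZ', 'ZHtobb']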
+plt.style.use(hep.style.ROOT)
+
+
+########################################################################################
+########## drop target from df and bring it to a separate column, drop weights #########
+########################################################################################
+X = df.drop("target", axis = 1)
+X = X.drop("target_bg", axis = 1)
+print(X)
+X = X.drop(f"wei_{roi}", axis = 1)
+X = X.drop(f"Z_mass_{roi}", axis = 1)
+X = X.drop(f"Z_pt_gen_{roi}", axis = 1)
+X = X.drop(f"Z_mass_gen_{roi}", axis = 1)
+print(X)
+print(X.info())
+
+y = df["target"]
+print(y)
+
+
+########################################################################################
+################# GRID search attempt ##################################################
+########################################################################################
+'''
+from sklearn.model_selection import GridSearchCV
+
+### Create the parameter grid
+gbm_param_grid = {'max_depth' : [3, 4, 5, 6, 7, 8, 9], 'min_child_weight' : [1], 'gamma' : [0], 'subsample' : [0.8], 'colsample_bytree' : [0.8], 'reg_alpha' : [0.005], 'n_estimators': [1000]}
+
+gbm = xgb.XGBRegressor()
+
+grid_mse = GridSearchCV(param_grid = gbm_param_grid, estimator = gbm, scoring = 'neg_mean_squared_error', cv = 4, verbose = 1)
+
+grid_mse.fit(X,y)
+
+
+print("Best parameters found: ", grid_mse.best_params_)
+print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_)))
+'''
+
+########################################################################################
+############# An attempt to do hyperparameter tuning for the classifier fit ############
+########################################################################################
+space = {"max_depth": hp.quniform("max_depth", 3, 18, 1),
+        "gamma": hp.uniform("gamma", 1, 9),
+        "reg_alpha": hp.quniform("reg_alpha", 40, 180, 1),
+        "reg_lambda": hp.uniform("reg_lambda", 0, 1),
+        "colsample_bytree": hp.uniform("colsample_bytree", 0.5, 1),
+        "min_child_weight": hp.quniform("min_child_weight", 0, 10, 1),
+        "n_estimators": 200,
+        "learning_rate": hp.uniform("learning_rate", 0.001, 0.1),
+        "subsample": hp.uniform("subsample", 0.8, 1),
+        "seed":0}
+
+#learning_rate = space['learning_rate'],
+
+def objective(space):
+    # keep colsample_bytree as a float: int() would floor the sampled 0.5-1 range to 0
+    clf = xgb.XGBClassifier( n_estimators = int(space['n_estimators']), max_depth = int(space['max_depth']), gamma = space['gamma'], reg_alpha = int(space['reg_alpha']), min_child_weight = int(space['min_child_weight']), colsample_bytree = float(space['colsample_bytree']), eval_metric = 'auc', early_stopping_rounds = 10)
+    evaluation = [(X_train, y_train), (X_test, y_test)]
+
+    clf.fit(X_train, y_train, eval_set = evaluation, verbose = False)
+    pred = clf.predict(X_test)
+    accuracy = accuracy_score(y_test, pred>0.5)
+    print("SCORE: ", accuracy)
+    return {'loss': -accuracy, 'status': STATUS_OK}
+
+
+#########################################################################################
+############# Create pipelines for xgb training #########################################
+#########################################################################################
+from sklearn.impute import SimpleImputer
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import OneHotEncoder
+
+categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),])
+
+from sklearn.preprocessing import StandardScaler
+numeric_pipeline = Pipeline(steps = [("impute", 
SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + +cat_cols = X.select_dtypes(exclude = "number").columns +num_cols = X.select_dtypes(include = "number").columns + +print(cat_cols) +print(num_cols) + +from sklearn.compose import ColumnTransformer + +full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + + + +X_processed = full_processor.fit_transform(X) +y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + +######################################################################################### +############ split dataset into training and test ####################################### +######################################################################################### +from sklearn.model_selection import train_test_split + +X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, stratify = y_processed, random_state = 1121218) +#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 101, stratify = y) +print(X_train) +print(X_test) +print(y_train) + +############################################################################################################ +######### preparing the XGB classifiers in 20 x 5-folds cross validation using repeated k-fold ############# +############################################################################################################ +cv = RepeatedKFold(n_splits = 8, n_repeats = 20, random_state = 101) +folds = [(train, test) for train, test in cv.split(X_train, y_train)] +#print(folds) +metrics = ['auc', 'fpr', 'tpr', 'thresholds'] +results = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +results_zero_train = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +results_weak_train = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + +params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': eta} +with open(net_path + f"plot/{folder_save}/results_first_{roi}_{eta}.json", 'w') as outfile: + json.dump(results, outfile) + + + +dtest = xgb.DMatrix(X_test, label = y_test) +#print(dtest) +for train, test in tqdm(folds, total = len(folds)): + print('train') + dtrain = xgb.DMatrix(X_train[train,:], + label = y_train[train]) + dval = xgb.DMatrix(X_train[test, :], label = y_train[test]) + model = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 10, num_boost_round = 200) #num_boost_round = 1000, 200 is optimal + model_zero_train = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 10, num_boost_round = 0) #num_boost_round = 1000, 200 is optimal + model_weak_train = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 10, num_boost_round = 20) #num_boost_round = 1000, 200 is optimal + sets = [dtrain, dval, dtest] + for i, ds in enumerate(results.keys()): + print(i) + y_preds = model.predict(sets[i]) + y_preds_zero_train = model_zero_train.predict(sets[i]) + y_preds_weak_train = model_weak_train.predict(sets[i]) + labels = sets[i].get_label() + fpr, tpr, thresholds = roc_curve(labels, y_preds) + fpr_zero, tpr_zero, thresholds_zero = 
roc_curve(labels, y_preds_zero_train) + fpr_weak, tpr_weak, thresholds_weak = roc_curve(labels, y_preds_weak_train) + results[ds]['fpr'].append(fpr) + results[ds]['tpr'].append(tpr) + results[ds]['thresholds'].append(thresholds) + results[ds]['auc'].append(roc_auc_score(labels, y_preds)) + results_zero_train[ds]['fpr'].append(fpr_zero) + results_zero_train[ds]['tpr'].append(tpr_zero) + results_zero_train[ds]['thresholds'].append(thresholds_zero) + results_zero_train[ds]['auc'].append(roc_auc_score(labels, y_preds_zero_train)) + results_weak_train[ds]['fpr'].append(fpr_weak) + results_weak_train[ds]['tpr'].append(tpr_weak) + results_weak_train[ds]['thresholds'].append(thresholds_weak) + results_weak_train[ds]['auc'].append(roc_auc_score(labels, y_preds_weak_train)) + +def convert(x): + if hasattr(x, "tolist"): + return x.tolist() + raise TypeError(x) + +with open(f"./plot/{folder_save}/results_lr_{roi}_{eta}_bg_full_bg_set.json", 'w') as outfile: + #json.dump(results, outfile, indent = 4) + str_j = json.dumps(results, indent = 4, sort_keys = True, default=convert) + outfile.write(str_j) + +with open(f"./plot/{folder_save}/results_zero_train_lr_{roi}_{eta}_bg_full_bg_set.json", 'w') as outfile: + #json.dump(results, outfile, indent = 4) + str_j = json.dumps(results_zero_train, indent = 4, sort_keys = True, default=convert) + outfile.write(str_j) + +with open(f"./plot/{folder_save}/results_weak_train_lr_{roi}_{eta}_bg_full_bg_set.json", 'w') as outfile: + #json.dump(results, outfile, indent = 4) + str_j = json.dumps(results_weak_train, indent = 4, sort_keys = True, default=convert) + outfile.write(str_j) + +########################################################################################################## +############## plotting the ROC curves with uncertainties ################################################ +########################################################################################################## +kind = 'val' + +c_fill = 'rgba(52, 152, 219, 0.2)' +c_line = 'rgba(52, 152, 219, 0.5)' +c_line_main = 'rgba(41, 128, 185, 1.0)' +c_grid = 'rgba(189, 195, 199, 0.5)' +c_annot = 'rgba(149, 165, 166, 0.5)' +c_highlight = 'rgba(192, 57, 43, 1.0)' + +fpr_mean = np.linspace(0, 1, 100) + +interp_tprs = [] +for i in range(100): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) +tpr_mean = np.mean(interp_tprs, axis = 0) +tpr_mean[-1] = 1.0 +tpr_std = 2*np.std(interp_tprs, axis = 0) +tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) +tpr_lower = tpr_mean - tpr_std +auc = np.mean(results[kind]['auc']) + +import plotly.graph_objects as go + +fig = go.Figure([go.Scatter(x = tpr_upper, y = fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = tpr_mean, y = fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.3f}')]) + +fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 0, y1 = 1) +fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'FPR (Background rejection)', yaxis_title = 'TPR (signal efficiency)', width = 1600, height = 900, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) +fig.update_yaxes(range 
= [0,1], gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') +fig.update_xaxes(range = [0,1], gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + +fig.write_image(f"./plot/{folder_save}/ROI_simple/{roi}/plotly_ROC_bg_eff_{eta}.jpg") +fig.write_image(f"./plot/{folder_save}/ROI_simple/{roi}/plotly_ROC_bg_eff_{eta}.pdf") + + +fig = go.Figure([go.Scatter(x = 1 - fpr_mean, y = tpr_upper, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = 1 - fpr_mean, y = tpr_lower, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = 1 - fpr_mean, y = tpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.3f}')]) + +fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) +fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = '1 - FPR (Background rejection)', yaxis_title = 'TPR (signal efficiency)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) +fig.update_yaxes(range = [0,1], gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') +fig.update_xaxes(range = [0,1], gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + +fig.write_image(f"./plot/{folder_save}/ROI_simple/{roi}/plotly_ROC_bg_rej_bg_full_bg_set_{eta}.jpg") +fig.write_image(f"./plot/{folder_save}/ROI_simple/{roi}/plotly_ROC_bg_rej_bg_full_bg_set_{eta}.pdf") + +################################################################################################## +########## Actual hyperparameter tuning ########################################################## +################################################################################################## + +trials = Trials() + +best_hyperparams = fmin(fn = objective, space = space, algo = tpe.suggest, max_evals = 100, trials = trials) +print("The best hyperparameters are: ", "\n") +print(best_hyperparams) + +################################################################################################## +################################################################################################## +################################################################################################## + + + + + + + + + + + + + + +from sklearn.metrics import accuracy_score + +### Init classifier +xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = best_hyperparams['learning_rate'], gamma = best_hyperparams['gamma'], reg_alpha = best_hyperparams['reg_alpha'], reg_lambda = best_hyperparams['reg_lambda'], n_estimators = 200, max_depth = int(best_hyperparams['max_depth']), subsample = best_hyperparams['subsample'], min_child_weight = best_hyperparams['min_child_weight'], colsample_bytree = best_hyperparams['colsample_bytree']) +#xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = 0.0292, gamma = 1.087, reg_alpha = 42.0, reg_lambda = 0.381, n_estimators = 200, max_depth = 8, subsample = 0.841, min_child_weight = 2.0, colsample_bytree = 0.994) + + +### Fit +dtest = xgb.DMatrix(X_test, label = y_test) +#print(dtest) +dtrain = xgb.DMatrix(X_train[:int(len(X_train)*0.8), :], label = y_train[:int(len(y_train)*0.8)]) +dval = xgb.DMatrix(X_train[int(len(X_train)*0.8):, :], label = y_train[int(len(y_train)*0.8):]) +model_xgb = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 
'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 200) #num_boost_round = 1000, +sets = [dtrain, dval, dtest] +results_new = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} +params_new = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss'} + +for i, ds in enumerate(results_new.keys()): + print(i) + y_preds_new = model_xgb.predict(sets[i]) + labels_new = sets[i].get_label() + fpr_new, tpr_new, thresholds_new = roc_curve(labels_new, y_preds_new) + results_new[ds]['fpr'].append(fpr_new) + results_new[ds]['tpr'].append(tpr_new) + results_new[ds]['thresholds'].append(thresholds_new) + results_new[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new)) + +xgb_cl.fit(X_train, y_train) + +print(xgb_cl) +### Predict +preds = xgb_cl.predict(X_test) + +print(accuracy_score(y_test, preds)) + +print(y_test) +print(model_xgb.predict(dtest)) +print(np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)])) +predict_train = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)]) + +print(accuracy_score(y_test, predict_train)) + +from xgboost import plot_importance +from xgboost import plot_tree, to_graphviz + +importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_}) +importances = importances.sort_values(by = "Importance", ascending = False) +importances = importances.set_index('Feature') +print(importances) +importances.plot.bar() + +fig, ax = plt.subplots(figsize=(17,12)) +plot_importance(xgb_cl, fmap = 'feature_map.txt', ax = ax) +plt.xlabel('Feature scores') +plt.ylabel("Feature names") +plt.title('Importance plot') +plt.legend(['']) +#plt.show() +plt.savefig(f"./plot/{folder_save}/importance_bg_{roi}_full_bg_set_{eta}.jpg") + +feature_importance = model.get_score(importance_type = 'weight') +keys = list(feature_importance.keys()) +names_sig = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] +values = list(feature_importance.values()) +data = pd.DataFrame(data = values, index = names_sig, columns = ['score']).sort_values(by = 'score', ascending = False) +print(data) +print(data.index) + + +fig = plt.figure(figsize=(17,12)) +ax1 = fig.add_subplot(1,2,1) +ax1.set_axis_off() +ax2 = fig.add_subplot(1,2,2) +ax2.barh(list(reversed(data.index)), list(reversed(data.score))) +ax2.set_xlabel('Feature scores') +ax2.set_ylabel("Feature names") +ax2.set_title('Importance plot') +#plt.show() +plt.savefig(f"./plot/{folder_save}/ROI_simple/{roi}/importance_train_bg_full_bg_set_{eta}.jpg") + +plt.figure(figsize=(17,12)) +plot_tree(xgb_cl, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"./plot/{folder_save}/ROI_simple/{roi}/boost_tree_bg_full_bg_set_{eta}.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' + +plt.figure(figsize=(17,12)) +plot_tree(model_xgb, fmap = 'feature_map.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"./plot/{folder_save}/ROI_simple/{roi}/boost_tree_train_bg_full_bg_set_{eta}.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' +''' +plt.figure(figsize=(17,12)) +to_graphviz(model_xgb, fmap = 'feature_map.txt') +plt.title('Decision 
tree graph') +#plt.show() +plt.savefig(f"plot/{folder_save}/boost_tree_train_graphviz.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show()''' diff --git a/xgb_test_only_xgb_reloaded_no_coffea.py b/xgb_test_only_xgb_reloaded_no_coffea.py index bc91384..eaebb78 100644 --- a/xgb_test_only_xgb_reloaded_no_coffea.py +++ b/xgb_test_only_xgb_reloaded_no_coffea.py @@ -14,7 +14,7 @@ import json net_path = "/net/scratch_cms3a/vaulin/" -folder_save = 'eval_23_05_02' +folder_save = 'eval_23_06_26_2' if not os.path.exists(f"./plot/{folder_save}"): os.mkdir(f"./plot/{folder_save}") if not os.path.exists(net_path + f"plot/{folder_save}"): @@ -133,22 +133,22 @@ def pretty_ROC_Curve(tr_set, kind, type): fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black') - fig.write_image(net_path + f"plot/{folder_save}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}.jpg") - fig.write_image(net_path + f"plot/{folder_save}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}.pdf") + fig.write_image(f"./plot/{folder_save}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}.jpg") + fig.write_image(f"./plot/{folder_save}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}.pdf") -pretty_ROC_Curve(net_path + f"plot/{folder_save}/results_lr_{learning_rate}.json", kind, "full") +pretty_ROC_Curve(f"./plot/{folder_save}/results_lr_{learning_rate}.json", kind, "full") ############################################################################################################################################################## ##################### Zero train ROC ######################################################################################################################### ############################################################################################################################################################## -pretty_ROC_Curve(net_path + f"plot/{folder_save}/results_zero_train_lr_{learning_rate}.json", kind, 'zero') +pretty_ROC_Curve(f"./plot/{folder_save}/results_zero_train_lr_{learning_rate}.json", kind, 'zero') ############################################################################################################################################################## ##################### Weak train ROC ######################################################################################################################### ############################################################################################################################################################## -pretty_ROC_Curve(net_path + f"plot/{folder_save}/results_weak_train_lr_{learning_rate}.json", kind, 'weak') +pretty_ROC_Curve(f"./plot/{folder_save}/results_weak_train_lr_{learning_rate}.json", kind, 'weak') ############################################################################################################################################################## @@ -223,7 +223,7 @@ def pretty_ROC_Curve(tr_set, kind, type): plt.title('Importance plot') plt.legend(['']) #plt.show() -plt.savefig(net_path + f"plot/{folder_save}/importance.jpg") +plt.savefig(f"./plot/{folder_save}/importance.jpg") 
feature_importance = model_xgb.get_score(importance_type = 'weight') @@ -247,14 +247,14 @@ def pretty_ROC_Curve(tr_set, kind, type): ax2.set_ylabel("Feature names") ax2.set_title('Importance plot') #plt.show() -plt.savefig(net_path + f"plot/{folder_save}/importance_train_lr_{learning_rate}.jpg") +plt.savefig(f"./plot/{folder_save}/importance_train_lr_{learning_rate}.jpg") plt.figure(figsize=(17,12)) plot_tree(xgb_cl, fmap = 'feature_map.txt') plt.title('Decision tree graph') #plt.show() -plt.savefig(net_path + f"plot/{folder_save}/boost_tree.jpg", dpi = 1800) +plt.savefig(f"./plot/{folder_save}/boost_tree.jpg", dpi = 1800) ###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 #plt.show()''' @@ -262,7 +262,7 @@ def pretty_ROC_Curve(tr_set, kind, type): plot_tree(model_xgb, fmap = 'feature_map.txt') plt.title('Decision tree graph') #plt.show() -plt.savefig(net_path + f"plot/{folder_save}/boost_tree_train_lr_{learning_rate}.jpg", dpi = 1800) +plt.savefig(f"./plot/{folder_save}/boost_tree_train_lr_{learning_rate}.jpg", dpi = 1800) ###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 #plt.show()''' @@ -273,7 +273,7 @@ def pretty_ROC_Curve(tr_set, kind, type): plt.title('Classifier output') plt.legend(['Train output', 'Train output after threshold','Test data']) #plt.show() -plt.savefig(net_path + f"plot/{folder_save}/class_output_train_lr_{learning_rate}.jpg") +plt.savefig(f"./plot/{folder_save}/class_output_train_lr_{learning_rate}.jpg") ###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 #plt.show()''' diff --git a/xgb_test_only_xgb_reloaded_no_coffea_var.py b/xgb_test_only_xgb_reloaded_no_coffea_var.py index 4451ef9..6e1aa4c 100644 --- a/xgb_test_only_xgb_reloaded_no_coffea_var.py +++ b/xgb_test_only_xgb_reloaded_no_coffea_var.py @@ -14,7 +14,7 @@ import json net_path = "/net/scratch_cms3a/vaulin/" -folder_save = 'eval_23_05_02' +folder_save = 'eval_23_06_26_2' if not os.path.exists(f"./plot/{folder_save}"): os.mkdir(f"./plot/{folder_save}") if not os.path.exists(net_path + f"plot/{folder_save}"): @@ -29,11 +29,14 @@ 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', 'del_phi_l2_subleading', 'del_phi_l2_leading'] -var = f'Higgs_mass_{roi}' +var = f'del_phi_l2_subleading_{roi}' time = arrow.now().format("YY_MM_DD") plt.style.use(hep.style.ROOT) + +df = df.sample(frac = 1).reset_index(drop=True) + X = df[var] print(X) print(X.info()) @@ -271,8 +274,68 @@ def pretty_ROC_Curve_var(results, kind, type, var): fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}_new.jpg") fig.write_image(f"plot/{folder_save}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}_new.pdf") +def pretty_ROC_Curve_var_test_train_val(results, type, var): + + results = results + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + + c_fill = 'rgba(52, 152, 219, 0.2)' + c_line = 'rgba(52, 152, 219, 0.5)' + c_line_train = 'rgba(41, 128, 185, 1.0)' + c_line_test = 'rgba(58, 217, 19, 0.8)' + c_line_val = 'rgba(244, 70, 10, 0.8)' + c_grid = 'rgba(189, 195, 199, 0.5)' + c_annot = 'rgba(149, 165, 166, 0.5)' + c_highlight = 'rgba(192, 57, 43, 1.0)' + + fpr_mean = np.linspace(0, 1, 100) + + interp_tprs = [] + + range_plot_x = [0,1] + range_plot_y = [0.2,1] + + import plotly.graph_objects as go + colours = {'test':c_line_test, 
'train': c_line_train, 'val': c_line_val}
+    fig_test = 0
+    fig_train = 0
+    fig_val = 0
+    figs = {'test': fig_test, 'train': fig_train, 'val': fig_val}
+    for kind in ['test', 'val', 'train']:
+        for i in range(1):
+            fpr = results[kind]['fpr'][i]
+            tpr = results[kind]['tpr'][i]
+            interp_tpr = np.interp(fpr_mean, fpr, tpr)
+            interp_tpr[0] = 0.0
+            interp_tprs = [interp_tpr] # keep only this kind's curve; appending across kinds would mix test/val/train in the means below
+        tpr_mean = np.mean(interp_tprs, axis = 0)
+        tpr_mean[-1] = 1.0
+        tpr_std = 2*np.std(interp_tprs, axis = 0)
+        tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1)
+        tpr_lower = tpr_mean - tpr_std
+        auc = np.mean(results[kind]['auc'])
+        colour = colours[kind]
+
+        figs[kind] = go.Scatter(x = tpr_mean, y = 1 - fpr_mean, line = dict(color = colour, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.5f}, {kind}')
+    fig = go.Figure(data = [figs['test'], figs['train'], figs['val']])
+
+    fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0)
+    fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,))
+    fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black')
+    fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black')
+
+    if not os.path.exists(f"plot/{folder_save}/ROC"):
+        os.mkdir(f"plot/{folder_save}/ROC")
+
+    fig.write_image(f"plot/{folder_save}/ROC/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_all_{var}_new.jpg")
+    fig.write_image(f"plot/{folder_save}/ROC/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_all_{var}_new.pdf")
+
 pretty_ROC_Curve_var(results_new, 'test', 'full', var)
+pretty_ROC_Curve_var_test_train_val(results_new, 'full', var)
+
 xgb_cl.fit(X_train, y_train)
 
 print(xgb_cl)
@@ -318,7 +381,7 @@ def pretty_ROC_Curve_var(results, kind, type, var):
                '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet',
                '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$',
                '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$']'''
-names_sig = ['m(H)']
+names_sig = ['$\Delta\Phi (l_{subleading}, jet_{subleading})$']
 values = list(feature_importance.values())
 data = pd.DataFrame(data = values, index = names_sig, columns = ['score']).sort_values(by = 'score', ascending = False)
 print(data)
diff --git a/xgb_test_only_xgb_reloaded_no_coffea_vars.py b/xgb_test_only_xgb_reloaded_no_coffea_vars.py
new file mode 100644
index 0000000..93eed4e
--- /dev/null
+++ b/xgb_test_only_xgb_reloaded_no_coffea_vars.py
@@ -0,0 +1,521 @@
+from coffea.util import load
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt, mplhep as hep
+import hist
+import argparse, sys, os, arrow, glob, yaml
+from matplotlib.offsetbox import AnchoredText
+import xgboost as xgb
+from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
+from sklearn.metrics import accuracy_score
+from tqdm.notebook import tqdm
+from sklearn.metrics import roc_auc_score, roc_curve
+from sklearn.model_selection import RepeatedKFold
+import json
+
+net_path = "/net/scratch_cms3a/vaulin/"
+folder_save = 'eval_23_07_19'
+if not 
os.path.exists(f"./plot/{folder_save}"): + os.mkdir(f"./plot/{folder_save}") +if not os.path.exists(f"./plot/{folder_save}/Diff_ROCs"): + os.mkdir(f"./plot/{folder_save}/Diff_ROCs") +if not os.path.exists(net_path + f"plot/{folder_save}"): + os.mkdir(net_path + f"plot/{folder_save}") +df = pd.read_csv(net_path + 'xgb_training_dataset_low_mumu.csv') + +roi = 'low_mumu' +learning_rate = 0.12 + +from itertools import combinations + +names_sig = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +names_ind_array = np.arange(len(names_sig)) + +possible_combos = [list(combinations(names_ind_array, i)) for i in range(1,len(names_sig))] +#print(possible_combos) +print(possible_combos[1]) +#print(possible_combos[-1]) + + +length = [len(possible_el) for possible_el in possible_combos] +print(length) + +import random +sequence_list = np.arange(0,len(names_sig)) +#print(sequence_list) +random.shuffle(sequence_list) +print(sequence_list) + +interesting_combos = [] +combos = [] + +for i in range(0, len(length)): + #print([len(elem) for elem in possible_combos[i]]) + for j in range(0, len(possible_combos[i])): + #print(list(possible_combos[i][j])) + #print(list(sequence_list[:i])) + if sorted(list(possible_combos[i][j])) == sorted(list(sequence_list[:(i+1)])): + print(sorted(list(possible_combos[i][j]))) + print(sorted(list(sequence_list[:(i+1)]))) + print(i, j) + combos.append([i,j]) + interesting_combos.append(sorted(list(possible_combos[i][j]))) + +print(combos) +#for k in range(0,len(combos)): + +print(interesting_combos) + + +############################################################################################################################################### +################### Getting ROC curves from json files ######################################################################################## +############################################################################################################################################### +def convert(x): + if hasattr(x, "tolist"): + return x.tolist() + raise TypeError(x) + + +kind = 'val' +#kind = 'test' +#kind = 'train' + +def pretty_ROC_Curve(tr_set, kind, type, var): + + with open(tr_set) as user_file: + file_contents = user_file.read() + + results = json.loads(file_contents) + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + + c_fill = 'rgba(52, 152, 219, 0.2)' + c_line = 'rgba(52, 152, 219, 0.5)' + c_line_main = 'rgba(41, 128, 185, 1.0)' + c_grid = 'rgba(189, 195, 199, 0.5)' + c_annot = 'rgba(149, 165, 166, 0.5)' + c_highlight = 'rgba(192, 57, 43, 1.0)' + + fpr_mean = np.linspace(0, 1, 100) + + interp_tprs = [] + for i in range(100): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) + tpr_mean = np.mean(interp_tprs, axis = 0) + tpr_mean[-1] = 1.0 + tpr_std = 2*np.std(interp_tprs, axis = 0) + tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) + tpr_lower = tpr_mean - tpr_std + auc = np.mean(results[kind]['auc']) + + range_plot_x = [0,1] + range_plot_y = [0.2,1] + + import plotly.graph_objects as go + + + fig = go.Figure([go.Scatter(x = tpr_upper, y = 1 - fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', 
showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = 1 - fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = tpr_mean, y = 1 - fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.5f}')]) + + fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) + fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) + fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') + fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + + fig.write_image(f"plot/{folder_save}/Diff_ROCs/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}.jpg") + fig.write_image(f"plot/{folder_save}/Diff_ROCs/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}.pdf") + +def pretty_ROC_Curve_var(results, kind, type, var): + + results = results + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + + c_fill = 'rgba(52, 152, 219, 0.2)' + c_line = 'rgba(52, 152, 219, 0.5)' + c_line_main = 'rgba(41, 128, 185, 1.0)' + c_grid = 'rgba(189, 195, 199, 0.5)' + c_annot = 'rgba(149, 165, 166, 0.5)' + c_highlight = 'rgba(192, 57, 43, 1.0)' + + fpr_mean = np.linspace(0, 1, 100) + + interp_tprs = [] + for i in range(1): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) + tpr_mean = np.mean(interp_tprs, axis = 0) + tpr_mean[-1] = 1.0 + tpr_std = 2*np.std(interp_tprs, axis = 0) + tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) + tpr_lower = tpr_mean - tpr_std + auc = np.mean(results[kind]['auc']) + + range_plot_x = [0,1] + range_plot_y = [0.2,1] + + import plotly.graph_objects as go + + + fig = go.Figure([go.Scatter(x = tpr_upper, y = 1 - fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = 1 - fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = tpr_mean, y = 1 - fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.5f}')]) + + fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) + fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) + fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') + fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + + fig.write_image(f"plot/{folder_save}/Diff_ROCs/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}_new.jpg") + 
fig.write_image(f"plot/{folder_save}/Diff_ROCs/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}_new.pdf") + +def pretty_ROC_Curve_var_test_train_val(results, type, var): + + results = results + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + + c_fill = 'rgba(52, 152, 219, 0.2)' + c_line = 'rgba(52, 152, 219, 0.5)' + c_line_train = 'rgba(41, 128, 185, 1.0)' + c_line_test = 'rgba(58, 217, 19, 0.8)' + c_line_val = 'rgba(244, 70, 10, 0.8)' + c_grid = 'rgba(189, 195, 199, 0.5)' + c_annot = 'rgba(149, 165, 166, 0.5)' + c_highlight = 'rgba(192, 57, 43, 1.0)' + + fpr_mean = np.linspace(0, 1, 100) + + interp_tprs = [] + + range_plot_x = [0,1] + range_plot_y = [0.2,1] + + import plotly.graph_objects as go + colours = {'test':c_line_test, 'train': c_line_train, 'val': c_line_val} + fig_test = 0 + fig_train = 0 + fig_val = 0 + figs = {'test': fig_test, 'train': fig_train, 'val': fig_val} + for kind in ['test', 'val', 'train']: + for i in range(1): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) + tpr_mean = np.mean(interp_tprs, axis = 0) + tpr_mean[-1] = 1.0 + tpr_std = 2*np.std(interp_tprs, axis = 0) + tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) + tpr_lower = tpr_mean - tpr_std + auc = np.mean(results[kind]['auc']) + colour = colours[kind] + + + figs[kind] = go.Scatter(x = tpr_mean, y = 1 - fpr_mean, line = dict(color = colour, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.5f}, {kind}') + fig = go.Figure(data = [figs['test'], figs['train'], figs['val']]) + + fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) + fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) + fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') + fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + + if not os.path.exists(f"plot/{folder_save}/ROC"): + os.mkdir(f"plot/{folder_save}/ROC") + + fig.write_image(f"plot/{folder_save}/ROC/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_all_{var}_new.jpg") + fig.write_image(f"plot/{folder_save}/ROC/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_all_{var}_new.pdf") +############################################################################################################################### + +for versions in interesting_combos: + versions_true = [int(version) for version in versions] + versions = [True if value in versions_true else False for value in range(0, len(names_sig))] + print(versions) + print(np.array(names_sig)[versions]) + var = np.array(names_sig)[versions] + var = [f"{va}_{roi}" for va in var] + + time = arrow.now().format("YY_MM_DD") + plt.style.use(hep.style.ROOT) + + + df = df.sample(frac = 1).reset_index(drop=True) + + X = df[list(var)] + print(X) + print(X.info()) + + X_signal = df[var][df.target == 1] + X_bg = df[var][df.target == 0] + + y = df["target"] + print(y) + + y_signal = df["target"][df.target == 1] + y_bg = 
df["target"][df.target == 0] + + from sklearn.impute import SimpleImputer + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import OneHotEncoder + + categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + + from sklearn.preprocessing import StandardScaler + numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + + cat_cols = X.select_dtypes(exclude = "number").columns + num_cols = X.select_dtypes(include = "number").columns + + print(cat_cols) + print(num_cols) + + from sklearn.compose import ColumnTransformer + + full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + + + + X_processed = full_processor.fit_transform(X) + y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + + y_processed_sig = SimpleImputer(strategy = "most_frequent").fit_transform(y_signal.values.reshape(-1,1)) + y_processed_bg = SimpleImputer(strategy = "most_frequent").fit_transform(y_bg.values.reshape(-1,1)) + + from sklearn.model_selection import train_test_split + + X_train, X_test, y_train, y_test = train_test_split(X, y_processed, stratify = y_processed, random_state = 1121218) + X_train_sig, X_test_sig, y_train_sig, y_test_sig = train_test_split(X_signal, y_processed_sig, stratify = y_processed_sig, random_state = 1121218) + X_train_bg, X_test_bg, y_train_bg, y_test_bg = train_test_split(X_bg, y_processed_bg, stratify = y_processed_bg, random_state = 1121218) + #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 101, stratify = y) + print(X_train) + print(X_test) + print(y_train) + + + + + + pretty_ROC_Curve(f"plot/{folder_save}/results_lr_{learning_rate}.json", kind, "full", versions_true) +############################################################################################################################################################## +##################### Zero train ROC ######################################################################################################################### +############################################################################################################################################################## + + pretty_ROC_Curve(f"plot/{folder_save}/results_zero_train_lr_{learning_rate}.json", kind, 'zero', versions_true) + +############################################################################################################################################################## +##################### Weak train ROC ######################################################################################################################### +############################################################################################################################################################## + + pretty_ROC_Curve(f"plot/{folder_save}/results_weak_train_lr_{learning_rate}.json", kind, 'weak', versions_true) + +############################################################################################################################################################## + + + trials = Trials() + +############################################################################################################################################################## +##################### Initiate the final training 
to be presented with the best parameters ################################################################### +############################################################################################################################################################## + + from sklearn.metrics import accuracy_score + +### Init classifier + xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = 0.0292, gamma = 1.087, reg_alpha = 42.0, reg_lambda = 0.381, n_estimators = 200, max_depth = 8, subsample = 0.841, min_child_weight = 2.0, colsample_bytree = 0.994, scale_pos_weight = 10) + +### Fit + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + dtest = xgb.DMatrix(X_test, label = y_test) + dtest_signal = xgb.DMatrix(X_test_sig, label = y_test_sig) + dtest_bg = xgb.DMatrix(X_test_bg, label = y_test_bg) +#print(dtest) + dtrain = xgb.DMatrix(X_train[:int(len(X_train)*0.8)], label = y_train[:int(len(y_train)*0.8)]) + dval = xgb.DMatrix(X_train[int(len(X_train)*0.8):], label = y_train[int(len(y_train)*0.8):]) + model_xgb = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 200) #num_boost_round = 1000, + model_xgb_weak = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 20) #num_boost_round = 1000, + model_xgb_zero = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 2) #num_boost_round = 1000, + sets = [dtrain, dval, dtest] + results_new = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + results_new_weak = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + results_new_zero = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + params_new = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss'} + + for i, ds in enumerate(results_new.keys()): + print(i) + y_preds_new = model_xgb.predict(sets[i]) + y_preds_new_weak = model_xgb_weak.predict(sets[i]) + y_preds_new_zero = model_xgb_zero.predict(sets[i]) + labels_new = sets[i].get_label() + fpr_new, tpr_new, thresholds_new = roc_curve(labels_new, y_preds_new) + fpr_new_weak, tpr_new_weak, thresholds_new_weak = roc_curve(labels_new, y_preds_new_weak) + fpr_new_zero, tpr_new_zero, thresholds_new_zero = roc_curve(labels_new, y_preds_new_zero) + results_new[ds]['fpr'].append(fpr_new) + results_new[ds]['tpr'].append(tpr_new) + results_new[ds]['thresholds'].append(thresholds_new) + results_new[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new)) + results_new_weak[ds]['fpr'].append(fpr_new_weak) + results_new_weak[ds]['tpr'].append(tpr_new_weak) + results_new_weak[ds]['thresholds'].append(thresholds_new_weak) + results_new_weak[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new_weak)) + results_new_zero[ds]['fpr'].append(fpr_new_zero) + results_new_zero[ds]['tpr'].append(tpr_new_zero) + results_new_zero[ds]['thresholds'].append(thresholds_new_zero) + results_new_zero[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new_zero)) + + + + pretty_ROC_Curve_var(results_new, 'test', 'full', versions_true) + + pretty_ROC_Curve_var_test_train_val(results_new, 'full', versions_true) + + 
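+    # --- Editor's sketch (assumption, not part of the original flow): the
+    # pretty_ROC_Curve() calls above read per-fold ROC results from JSON files
+    # that an earlier k-fold run is assumed to have written. The results_new
+    # dictionary computed above can be saved in the same layout via the convert()
+    # helper defined at the top of this script, so this feature combination can
+    # be re-plotted later without retraining; the file name here is hypothetical:
+    with open(f"plot/{folder_save}/results_vars_{versions_true}_lr_{learning_rate}.json", "w") as jf:
+        json.dump(results_new, jf, default = convert)
+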
xgb_cl.fit(X_train, y_train) + + print(xgb_cl) + +################################################################################################################################### +################################## Predict and give the final accuracy scores and importance plots ################################ +################################################################################################################################### + preds = xgb_cl.predict(X_test) + + print(accuracy_score(y_test, preds)) + + print(y_test) + print(model_xgb.predict(dtest)) + print(np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)])) + predict_train = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)]) + predict_train_weak = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb_weak.predict(dtest)]) + predict_train_zero = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb_zero.predict(dtest)]) + + print(accuracy_score(y_test, predict_train)) + +from xgboost import plot_importance +from xgboost import plot_tree, to_graphviz + +#importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_}) +#importances = importances.sort_values(by = "Importance", ascending = False) +#importances = importances.set_index('Feature') +#print(importances) +#importances.plot.bar() +''' +fig, ax = plt.subplots(figsize=(17,12)) +plot_importance(xgb_cl, fmap = 'feature_map_var.txt', ax = ax) +plt.xlabel('Feature scores') +plt.ylabel("Feature names") +plt.title('Importance plot') +plt.legend(['']) +#plt.show() +plt.savefig(f"plot/{folder_save}/importance_{var}.jpg") + + +feature_importance = model_xgb.get_score(importance_type = 'weight') +keys = list(feature_importance.keys()) +names_sig = ['m(H)', '$p_t$(H)', '$p_t$(Z)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$', + '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet', + '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$', + '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$'] +names_sig = ['$\Delta\Phi (l_{subleading}, jet_{subleading})$'] +values = list(feature_importance.values()) +data = pd.DataFrame(data = values, index = names_sig, columns = ['score']).sort_values(by = 'score', ascending = False) +print(data) +print(data.index) + + +fig = plt.figure(figsize=(17,12)) +ax1 = fig.add_subplot(1,2,1) +ax1.set_axis_off() +ax2 = fig.add_subplot(1,2,2) +ax2.barh(list(reversed(data.index)), list(reversed(data.score))) +ax2.set_xlabel('Feature scores') +ax2.set_ylabel("Feature names") +ax2.set_title('Importance plot') +#plt.show() +plt.savefig(f"plot/{folder_save}/importance_train_lr_{learning_rate}_{var}.jpg") + + +plt.figure(figsize=(17,12)) +plot_tree(xgb_cl, fmap = 'feature_map_var.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"plot/{folder_save}/boost_tree_{var}.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show() + +plt.figure(figsize=(17,12)) +plot_tree(model_xgb, fmap = 'feature_map_var.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"plot/{folder_save}/boost_tree_train_lr_{learning_rate}_{var}.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show() +''' +plt.figure(figsize=(17,12)) 
+plt.hist(np.array(model_xgb.predict(dtest)), bins = 40, edgecolor = 'blue',fill = False)
+plt.hist(np.array(predict_train), bins = 40, edgecolor = 'green', hatch = '/', fill = False)
+plt.hist(y_test, bins = 40, facecolor = 'orange', edgecolor = 'orange', fill = False)
+plt.title('Classifier output')
+plt.legend(['Model output on test set', 'Thresholded output', 'True test labels'])
+#plt.show()
+plt.savefig(f"plot/{folder_save}/class_output_train_lr_{learning_rate}_{versions_true}.jpg")
+
+plt.figure(figsize=(17,12))
+plt.hist(np.array(model_xgb.predict(dtest_signal)), bins = 40, edgecolor = 'blue',fill = False)
+plt.hist(np.array(model_xgb.predict(dtest_bg)), bins = 40, edgecolor = 'red', fill = False)
+plt.title('Classifier output')
+plt.legend(['Signal', 'Background'])
+#plt.show()
+plt.savefig(f"plot/{folder_save}/class_output_train_lr_{learning_rate}_{versions_true}_sig_vs_bg.jpg")
+
+plt.figure(figsize=(17,12))
+plt.hist(np.array(model_xgb_weak.predict(dtest)), bins = 40, edgecolor = 'blue',fill = False)
+plt.hist(np.array(predict_train_weak), bins = 40, edgecolor = 'green', hatch = '/', fill = False)
+plt.hist(y_test, bins = 40, facecolor = 'orange', edgecolor = 'orange', fill = False)
+plt.title('Classifier output')
+plt.legend(['Model output on test set', 'Thresholded output', 'True test labels'])
+#plt.show()
+plt.savefig(f"plot/{folder_save}/class_output_train_lr_{learning_rate}_{versions_true}_weak.jpg")
+
+plt.figure(figsize=(17,12))
+plt.hist(np.array(model_xgb_zero.predict(dtest)), bins = 40, edgecolor = 'blue',fill = False)
+plt.hist(np.array(predict_train_zero), bins = 40, edgecolor = 'green', hatch = '/', fill = False)
+plt.hist(y_test, bins = 40, facecolor = 'orange', edgecolor = 'orange', fill = False)
+plt.title('Classifier output')
+plt.legend(['Model output on test set', 'Thresholded output', 'True test labels'])
+#plt.show()
+plt.savefig(f"plot/{folder_save}/class_output_train_lr_{learning_rate}_{versions_true}_zero.jpg")
+
+###result = 1/(1+np.exp(-leaf_value)) for belonging to class 1
+#plt.show()
+
+with open(f"plot/{folder_save}/ROC.txt", "a") as myfile:
+    myfile.write(f"Accuracy score for {var}: " + str(accuracy_score(y_test, predict_train)) + " " + '\n')
+
+'''
+plt.figure(figsize=(17,12))
+to_graphviz(model_xgb, fmap = 'feature_map.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/boost_tree_train_graphviz.jpg", dpi = 1800)
+###result = 1/(1+np.exp(-leaf_value)) for belonging to class 1
+#plt.show()'''
diff --git a/xgb_test_only_xgb_reloaded_no_coffea_vars_bg.py b/xgb_test_only_xgb_reloaded_no_coffea_vars_bg.py
new file mode 100644
index 0000000..c5ad2ac
--- /dev/null
+++ b/xgb_test_only_xgb_reloaded_no_coffea_vars_bg.py
@@ -0,0 +1,524 @@
+from coffea.util import load
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt, mplhep as hep
+import hist
+import argparse, sys, os, arrow, glob, yaml
+from matplotlib.offsetbox import AnchoredText
+import xgboost as xgb
+from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
+from sklearn.metrics import accuracy_score
+from tqdm.notebook import tqdm
+from sklearn.metrics import roc_auc_score, roc_curve
+from sklearn.model_selection import RepeatedKFold
+import json
+
+net_path = "/net/scratch_cms3a/vaulin/"
+folder_save = 'eval_23_07_19'
+if not os.path.exists(f"./plot/{folder_save}"):
+    os.mkdir(f"./plot/{folder_save}")
+if not os.path.exists(f"./plot/{folder_save}/Diff_ROCs"):
+    os.mkdir(f"./plot/{folder_save}/Diff_ROCs")
+if not os.path.exists(net_path + f"plot/{folder_save}"):
+    
os.mkdir(net_path + f"plot/{folder_save}") +df = pd.read_csv('xgb_training_dataset_low_mumu.csv') + +roi = 'low_mumu' +learning_rate = 0.12 + +bgs = ['DY', "ZZ", "WZ", "tt", "ZHtobb"] +bg_choice = 2 + +from itertools import combinations + +names_sig = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +names_ind_array = np.arange(len(names_sig)) + +possible_combos = [list(combinations(names_ind_array, i)) for i in range(1,len(names_sig))] +#print(possible_combos) +print(possible_combos[1]) +#print(possible_combos[-1]) + + +length = [len(possible_el) for possible_el in possible_combos] +print(length) + +import random +sequence_list = np.arange(0,len(names_sig)) +#print(sequence_list) +random.shuffle(sequence_list) +print(sequence_list) + +interesting_combos = [] +combos = [] + +for i in range(0, len(length)): + #print([len(elem) for elem in possible_combos[i]]) + for j in range(0, len(possible_combos[i])): + #print(list(possible_combos[i][j])) + #print(list(sequence_list[:i])) + if sorted(list(possible_combos[i][j])) == sorted(list(sequence_list[:(i+1)])): + print(sorted(list(possible_combos[i][j]))) + print(sorted(list(sequence_list[:(i+1)]))) + print(i, j) + combos.append([i,j]) + interesting_combos.append(sorted(list(possible_combos[i][j]))) + +print(combos) +#for k in range(0,len(combos)): + +print(interesting_combos) + + +############################################################################################################################################### +################### Getting ROC curves from json files ######################################################################################## +############################################################################################################################################### +def convert(x): + if hasattr(x, "tolist"): + return x.tolist() + raise TypeError(x) + + +kind = 'val' +#kind = 'test' +#kind = 'train' + +def pretty_ROC_Curve(tr_set, kind, type, var): + + with open(tr_set) as user_file: + file_contents = user_file.read() + + results = json.loads(file_contents) + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + + c_fill = 'rgba(52, 152, 219, 0.2)' + c_line = 'rgba(52, 152, 219, 0.5)' + c_line_main = 'rgba(41, 128, 185, 1.0)' + c_grid = 'rgba(189, 195, 199, 0.5)' + c_annot = 'rgba(149, 165, 166, 0.5)' + c_highlight = 'rgba(192, 57, 43, 1.0)' + + fpr_mean = np.linspace(0, 1, 100) + + interp_tprs = [] + for i in range(100): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) + tpr_mean = np.mean(interp_tprs, axis = 0) + tpr_mean[-1] = 1.0 + tpr_std = 2*np.std(interp_tprs, axis = 0) + tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) + tpr_lower = tpr_mean - tpr_std + auc = np.mean(results[kind]['auc']) + + range_plot_x = [0,1] + range_plot_y = [0.2,1] + + import plotly.graph_objects as go + + + fig = go.Figure([go.Scatter(x = tpr_upper, y = 1 - fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = 1 - fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = 
False, name = 'lower'), + go.Scatter(x = tpr_mean, y = 1 - fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.5f}')]) + + fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) + fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) + fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') + fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + + fig.write_image(f"plot/{folder_save}/Diff_ROCs/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}_{bgs[bg_choice]}.jpg") + fig.write_image(f"plot/{folder_save}/Diff_ROCs/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}_{bgs[bg_choice]}.pdf") + +def pretty_ROC_Curve_var(results, kind, type, var): + + results = results + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + + c_fill = 'rgba(52, 152, 219, 0.2)' + c_line = 'rgba(52, 152, 219, 0.5)' + c_line_main = 'rgba(41, 128, 185, 1.0)' + c_grid = 'rgba(189, 195, 199, 0.5)' + c_annot = 'rgba(149, 165, 166, 0.5)' + c_highlight = 'rgba(192, 57, 43, 1.0)' + + fpr_mean = np.linspace(0, 1, 100) + + interp_tprs = [] + for i in range(1): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) + tpr_mean = np.mean(interp_tprs, axis = 0) + tpr_mean[-1] = 1.0 + tpr_std = 2*np.std(interp_tprs, axis = 0) + tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) + tpr_lower = tpr_mean - tpr_std + auc = np.mean(results[kind]['auc']) + + range_plot_x = [0,1] + range_plot_y = [0.2,1] + + import plotly.graph_objects as go + + + fig = go.Figure([go.Scatter(x = tpr_upper, y = 1 - fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = 1 - fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = tpr_mean, y = 1 - fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.5f}')]) + + fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) + fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) + fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') + fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + + fig.write_image(f"plot/{folder_save}/Diff_ROCs/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}_new_{bgs[bg_choice]}.jpg") + 
fig.write_image(f"plot/{folder_save}/Diff_ROCs/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}_new_{bgs[bg_choice]}.pdf") + +def pretty_ROC_Curve_var_test_train_val(results, type, var): + + results = results + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + + c_fill = 'rgba(52, 152, 219, 0.2)' + c_line = 'rgba(52, 152, 219, 0.5)' + c_line_train = 'rgba(41, 128, 185, 1.0)' + c_line_test = 'rgba(58, 217, 19, 0.8)' + c_line_val = 'rgba(244, 70, 10, 0.8)' + c_grid = 'rgba(189, 195, 199, 0.5)' + c_annot = 'rgba(149, 165, 166, 0.5)' + c_highlight = 'rgba(192, 57, 43, 1.0)' + + fpr_mean = np.linspace(0, 1, 100) + + interp_tprs = [] + + range_plot_x = [0,1] + range_plot_y = [0.2,1] + + import plotly.graph_objects as go + colours = {'test':c_line_test, 'train': c_line_train, 'val': c_line_val} + fig_test = 0 + fig_train = 0 + fig_val = 0 + figs = {'test': fig_test, 'train': fig_train, 'val': fig_val} + for kind in ['test', 'val', 'train']: + for i in range(1): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) + tpr_mean = np.mean(interp_tprs, axis = 0) + tpr_mean[-1] = 1.0 + tpr_std = 2*np.std(interp_tprs, axis = 0) + tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) + tpr_lower = tpr_mean - tpr_std + auc = np.mean(results[kind]['auc']) + colour = colours[kind] + + + figs[kind] = go.Scatter(x = tpr_mean, y = 1 - fpr_mean, line = dict(color = colour, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.5f}, {kind}') + fig = go.Figure(data = [figs['test'], figs['train'], figs['val']]) + + fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) + fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) + fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') + fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + + if not os.path.exists(f"plot/{folder_save}/ROC"): + os.mkdir(f"plot/{folder_save}/ROC") + + fig.write_image(f"plot/{folder_save}/ROC/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_all_{var}_new_{bgs[bg_choice]}.jpg") + fig.write_image(f"plot/{folder_save}/ROC/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_all_{var}_new_{bgs[bg_choice]}.pdf") +############################################################################################################################### + +for versions in interesting_combos: + versions_true = [int(version) for version in versions] + versions = [True if value in versions_true else False for value in range(0, len(names_sig))] + print(versions) + print(np.array(names_sig)[versions]) + var = np.array(names_sig)[versions] + var = [f"{va}_{roi}" for va in var] + + time = arrow.now().format("YY_MM_DD") + plt.style.use(hep.style.ROOT) + + + df = df.sample(frac = 1).reset_index(drop=True) + + X = df[list(var)] + print(X) + print(X.info()) + + X_signal = df[var][df.target == 1] + X_bg = df[var][df.target == 0] + + y = df["target"] + print(y) + + y_signal = 
df["target"][df.target == 1] + y_bg = df["target"][df.target == 0] + + from sklearn.impute import SimpleImputer + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import OneHotEncoder + + categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + + from sklearn.preprocessing import StandardScaler + numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + + cat_cols = X.select_dtypes(exclude = "number").columns + num_cols = X.select_dtypes(include = "number").columns + + print(cat_cols) + print(num_cols) + + from sklearn.compose import ColumnTransformer + + full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + + + + X_processed = full_processor.fit_transform(X) + y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + + y_processed_sig = SimpleImputer(strategy = "most_frequent").fit_transform(y_signal.values.reshape(-1,1)) + y_processed_bg = SimpleImputer(strategy = "most_frequent").fit_transform(y_bg.values.reshape(-1,1)) + + from sklearn.model_selection import train_test_split + + X_train, X_test, y_train, y_test = train_test_split(X, y_processed, stratify = y_processed, random_state = 1121218) + X_train_sig, X_test_sig, y_train_sig, y_test_sig = train_test_split(X_signal, y_processed_sig, stratify = y_processed_sig, random_state = 1121218) + X_train_bg, X_test_bg, y_train_bg, y_test_bg = train_test_split(X_bg, y_processed_bg, stratify = y_processed_bg, random_state = 1121218) + #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 101, stratify = y) + print(X_train) + print(X_test) + print(y_train) + + + + + + pretty_ROC_Curve(f"plot/{folder_save}/results_lr_{learning_rate}_bg_{bgs[bg_choice]}.json", kind, "full", versions_true) +############################################################################################################################################################## +##################### Zero train ROC ######################################################################################################################### +############################################################################################################################################################## + + pretty_ROC_Curve(f"plot/{folder_save}/results_zero_train_lr_{learning_rate}_bg_{bgs[bg_choice]}.json", kind, 'zero', versions_true) + +############################################################################################################################################################## +##################### Weak train ROC ######################################################################################################################### +############################################################################################################################################################## + + pretty_ROC_Curve(f"plot/{folder_save}/results_weak_train_lr_{learning_rate}_bg_{bgs[bg_choice]}.json", kind, 'weak', versions_true) + +############################################################################################################################################################## + + + trials = Trials() + 
+############################################################################################################################################################## +##################### Initiate the final training to be presented with the best parameters ################################################################### +############################################################################################################################################################## + + from sklearn.metrics import accuracy_score + +### Init classifier + xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = 0.0292, gamma = 1.087, reg_alpha = 42.0, reg_lambda = 0.381, n_estimators = 200, max_depth = 8, subsample = 0.841, min_child_weight = 2.0, colsample_bytree = 0.994, scale_pos_weight = 10) + +### Fit + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + dtest = xgb.DMatrix(X_test, label = y_test) + dtest_signal = xgb.DMatrix(X_test_sig, label = y_test_sig) + dtest_bg = xgb.DMatrix(X_test_bg, label = y_test_bg) +#print(dtest) + dtrain = xgb.DMatrix(X_train[:int(len(X_train)*0.8)], label = y_train[:int(len(y_train)*0.8)]) + dval = xgb.DMatrix(X_train[int(len(X_train)*0.8):], label = y_train[int(len(y_train)*0.8):]) + model_xgb = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 200) #num_boost_round = 1000, + model_xgb_weak = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 20) #num_boost_round = 1000, + model_xgb_zero = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 2) #num_boost_round = 1000, + sets = [dtrain, dval, dtest] + results_new = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + results_new_weak = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + results_new_zero = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + params_new = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss'} + + for i, ds in enumerate(results_new.keys()): + print(i) + y_preds_new = model_xgb.predict(sets[i]) + y_preds_new_weak = model_xgb_weak.predict(sets[i]) + y_preds_new_zero = model_xgb_zero.predict(sets[i]) + labels_new = sets[i].get_label() + fpr_new, tpr_new, thresholds_new = roc_curve(labels_new, y_preds_new) + fpr_new_weak, tpr_new_weak, thresholds_new_weak = roc_curve(labels_new, y_preds_new_weak) + fpr_new_zero, tpr_new_zero, thresholds_new_zero = roc_curve(labels_new, y_preds_new_zero) + results_new[ds]['fpr'].append(fpr_new) + results_new[ds]['tpr'].append(tpr_new) + results_new[ds]['thresholds'].append(thresholds_new) + results_new[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new)) + results_new_weak[ds]['fpr'].append(fpr_new_weak) + results_new_weak[ds]['tpr'].append(tpr_new_weak) + results_new_weak[ds]['thresholds'].append(thresholds_new_weak) + results_new_weak[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new_weak)) + results_new_zero[ds]['fpr'].append(fpr_new_zero) + results_new_zero[ds]['tpr'].append(tpr_new_zero) + results_new_zero[ds]['thresholds'].append(thresholds_new_zero) + 
results_new_zero[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new_zero)) + + + + pretty_ROC_Curve_var(results_new, 'test', 'full', versions_true) + + pretty_ROC_Curve_var_test_train_val(results_new, 'full', versions_true) + + xgb_cl.fit(X_train, y_train) + + print(xgb_cl) + +################################################################################################################################### +################################## Predict and give the final accuracy scores and importance plots ################################ +################################################################################################################################### + preds = xgb_cl.predict(X_test) + + print(accuracy_score(y_test, preds)) + + print(y_test) + print(model_xgb.predict(dtest)) + print(np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)])) + predict_train = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)]) + predict_train_weak = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb_weak.predict(dtest)]) + predict_train_zero = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb_zero.predict(dtest)]) + + print(accuracy_score(y_test, predict_train)) + + from xgboost import plot_importance + from xgboost import plot_tree, to_graphviz + + #importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_}) + #importances = importances.sort_values(by = "Importance", ascending = False) + #importances = importances.set_index('Feature') + #print(importances) + #importances.plot.bar() + ''' +fig, ax = plt.subplots(figsize=(17,12)) +plot_importance(xgb_cl, fmap = 'feature_map_var.txt', ax = ax) +plt.xlabel('Feature scores') +plt.ylabel("Feature names") +plt.title('Importance plot') +plt.legend(['']) +#plt.show() +plt.savefig(f"plot/{folder_save}/importance_{var}.jpg") + + +feature_importance = model_xgb.get_score(importance_type = 'weight') +keys = list(feature_importance.keys()) +names_sig = ['m(H)', '$p_t$(H)', '$p_t$(Z)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$', + '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet', + '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$', + '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$'] +names_sig = ['$\Delta\Phi (l_{subleading}, jet_{subleading})$'] +values = list(feature_importance.values()) +data = pd.DataFrame(data = values, index = names_sig, columns = ['score']).sort_values(by = 'score', ascending = False) +print(data) +print(data.index) + + +fig = plt.figure(figsize=(17,12)) +ax1 = fig.add_subplot(1,2,1) +ax1.set_axis_off() +ax2 = fig.add_subplot(1,2,2) +ax2.barh(list(reversed(data.index)), list(reversed(data.score))) +ax2.set_xlabel('Feature scores') +ax2.set_ylabel("Feature names") +ax2.set_title('Importance plot') +#plt.show() +plt.savefig(f"plot/{folder_save}/importance_train_lr_{learning_rate}_{var}.jpg") + + +plt.figure(figsize=(17,12)) +plot_tree(xgb_cl, fmap = 'feature_map_var.txt') +plt.title('Decision tree graph') +#plt.show() +plt.savefig(f"plot/{folder_save}/boost_tree_{var}.jpg", dpi = 1800) +###result = 1/(1+np.exp(leaf_value))) for belonging to calss 1 +#plt.show() + +plt.figure(figsize=(17,12)) +plot_tree(model_xgb, fmap = 'feature_map_var.txt') +plt.title('Decision tree 
graph')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/boost_tree_train_lr_{learning_rate}_{var}.jpg", dpi = 1800)
+###result = 1/(1+np.exp(-leaf_value)) for belonging to class 1
+#plt.show()
+    '''
+    plt.figure(figsize=(17,12))
+    plt.hist(np.array(model_xgb.predict(dtest)), bins = 40, edgecolor = 'blue',fill = False)
+    plt.hist(np.array(predict_train), bins = 40, edgecolor = 'green', hatch = '/', fill = False)
+    plt.hist(y_test, bins = 40, facecolor = 'orange', edgecolor = 'orange', fill = False)
+    plt.title('Classifier output')
+    plt.legend(['Model output on test set', 'Thresholded output', 'True test labels'])
+    #plt.show()
+    plt.savefig(f"plot/{folder_save}/class_output_train_lr_{learning_rate}_{versions_true}_{bgs[bg_choice]}.jpg")
+
+    plt.figure(figsize=(17,12))
+    plt.hist(np.array(model_xgb.predict(dtest_signal)), bins = 40, edgecolor = 'blue',fill = False)
+    plt.hist(np.array(model_xgb.predict(dtest_bg)), bins = 40, edgecolor = 'red', fill = False)
+    plt.title('Classifier output')
+    plt.legend(['Signal', 'Background'])
+    #plt.show()
+    plt.savefig(f"plot/{folder_save}/class_output_train_lr_{learning_rate}_{versions_true}_sig_vs_bg_{bgs[bg_choice]}.jpg")
+
+    plt.figure(figsize=(17,12))
+    plt.hist(np.array(model_xgb_weak.predict(dtest)), bins = 40, edgecolor = 'blue',fill = False)
+    plt.hist(np.array(predict_train_weak), bins = 40, edgecolor = 'green', hatch = '/', fill = False)
+    plt.hist(y_test, bins = 40, facecolor = 'orange', edgecolor = 'orange', fill = False)
+    plt.title('Classifier output')
+    plt.legend(['Model output on test set', 'Thresholded output', 'True test labels'])
+    #plt.show()
+    plt.savefig(f"plot/{folder_save}/class_output_train_lr_{learning_rate}_{versions_true}_weak_{bgs[bg_choice]}.jpg")
+
+    plt.figure(figsize=(17,12))
+    plt.hist(np.array(model_xgb_zero.predict(dtest)), bins = 40, edgecolor = 'blue',fill = False)
+    plt.hist(np.array(predict_train_zero), bins = 40, edgecolor = 'green', hatch = '/', fill = False)
+    plt.hist(y_test, bins = 40, facecolor = 'orange', edgecolor = 'orange', fill = False)
+    plt.title('Classifier output')
+    plt.legend(['Model output on test set', 'Thresholded output', 'True test labels'])
+    #plt.show()
+    plt.savefig(f"plot/{folder_save}/class_output_train_lr_{learning_rate}_{versions_true}_zero_{bgs[bg_choice]}.jpg")
+
+###result = 1/(1+np.exp(-leaf_value)) for belonging to class 1
+#plt.show()
+
+with open(f"plot/{folder_save}/ROC.txt", "a") as myfile:
+    myfile.write(f"Accuracy score for {var}: " + str(accuracy_score(y_test, predict_train)) + " " + '\n')
+
+'''
+plt.figure(figsize=(17,12))
+to_graphviz(model_xgb, fmap = 'feature_map.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/boost_tree_train_graphviz.jpg", dpi = 1800)
+###result = 1/(1+np.exp(-leaf_value)) for belonging to class 1
+#plt.show()'''
diff --git a/xgb_test_only_xgb_reloaded_no_coffea_vars_bg_multibg.py b/xgb_test_only_xgb_reloaded_no_coffea_vars_bg_multibg.py
new file mode 100644
index 0000000..c6d21b4
--- /dev/null
+++ b/xgb_test_only_xgb_reloaded_no_coffea_vars_bg_multibg.py
@@ -0,0 +1,525 @@
+from coffea.util import load
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt, mplhep as hep
+import hist
+import argparse, sys, os, arrow, glob, yaml
+from matplotlib.offsetbox import AnchoredText
+import xgboost as xgb
+from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
+from sklearn.metrics import accuracy_score
+from tqdm.notebook import tqdm
+from sklearn.metrics import roc_auc_score, roc_curve
+from sklearn.model_selection import 
RepeatedKFold +import json + +net_path = "/net/scratch_cms3a/vaulin/" +folder_save = 'eval_23_08_08' +if not os.path.exists(f"./plot/{folder_save}"): + os.mkdir(f"./plot/{folder_save}") +if not os.path.exists(f"./plot/{folder_save}/Diff_ROCs"): + os.mkdir(f"./plot/{folder_save}/Diff_ROCs") +if not os.path.exists(net_path + f"plot/{folder_save}"): + os.mkdir(net_path + f"plot/{folder_save}") +df = pd.read_csv('xgb_training_dataset_low_mumu.csv') + +roi = 'low_mumu' +learning_rate = 0.03 + +bgs = ['DY', "ZZ", "WZ", "tt", "ZHtobb"] +bg_choice = 1 +bg_choice_2 = 0 + +from itertools import combinations + +names_sig = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +names_ind_array = np.arange(len(names_sig)) + +possible_combos = [list(combinations(names_ind_array, i)) for i in range(1,len(names_sig))] +#print(possible_combos) +print(possible_combos[1]) +#print(possible_combos[-1]) + + +length = [len(possible_el) for possible_el in possible_combos] +print(length) + +import random +sequence_list = np.arange(0,len(names_sig)) +#print(sequence_list) +random.shuffle(sequence_list) +print(sequence_list) + +interesting_combos = [] +combos = [] + +for i in range(0, len(length)): + #print([len(elem) for elem in possible_combos[i]]) + for j in range(0, len(possible_combos[i])): + #print(list(possible_combos[i][j])) + #print(list(sequence_list[:i])) + if sorted(list(possible_combos[i][j])) == sorted(list(sequence_list[:(i+1)])): + print(sorted(list(possible_combos[i][j]))) + print(sorted(list(sequence_list[:(i+1)]))) + print(i, j) + combos.append([i,j]) + interesting_combos.append(sorted(list(possible_combos[i][j]))) + +print(combos) +#for k in range(0,len(combos)): + +print(interesting_combos) + + +############################################################################################################################################### +################### Getting ROC curves from json files ######################################################################################## +############################################################################################################################################### +def convert(x): + if hasattr(x, "tolist"): + return x.tolist() + raise TypeError(x) + + +kind = 'val' +#kind = 'test' +#kind = 'train' + +def pretty_ROC_Curve(tr_set, kind, type, var): + + with open(tr_set) as user_file: + file_contents = user_file.read() + + results = json.loads(file_contents) + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + + c_fill = 'rgba(52, 152, 219, 0.2)' + c_line = 'rgba(52, 152, 219, 0.5)' + c_line_main = 'rgba(41, 128, 185, 1.0)' + c_grid = 'rgba(189, 195, 199, 0.5)' + c_annot = 'rgba(149, 165, 166, 0.5)' + c_highlight = 'rgba(192, 57, 43, 1.0)' + + fpr_mean = np.linspace(0, 1, 100) + + interp_tprs = [] + for i in range(100): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) + tpr_mean = np.mean(interp_tprs, axis = 0) + tpr_mean[-1] = 1.0 + tpr_std = 2*np.std(interp_tprs, axis = 0) + tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) + tpr_lower = tpr_mean - tpr_std + auc = np.mean(results[kind]['auc']) + + range_plot_x = [0,1] + range_plot_y = 
[0.2,1] + + import plotly.graph_objects as go + + + fig = go.Figure([go.Scatter(x = tpr_upper, y = 1 - fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = 1 - fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = tpr_mean, y = 1 - fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.5f}')]) + + fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) + fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) + fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') + fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + + fig.write_image(f"plot/{folder_save}/Diff_ROCs/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}_{bgs[bg_choice]}_{bgs[bg_choice_2]}.jpg") + fig.write_image(f"plot/{folder_save}/Diff_ROCs/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}_{bgs[bg_choice]}_{bgs[bg_choice_2]}.pdf") + +def pretty_ROC_Curve_var(results, kind, type, var): + + results = results + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + + c_fill = 'rgba(52, 152, 219, 0.2)' + c_line = 'rgba(52, 152, 219, 0.5)' + c_line_main = 'rgba(41, 128, 185, 1.0)' + c_grid = 'rgba(189, 195, 199, 0.5)' + c_annot = 'rgba(149, 165, 166, 0.5)' + c_highlight = 'rgba(192, 57, 43, 1.0)' + + fpr_mean = np.linspace(0, 1, 100) + + interp_tprs = [] + for i in range(1): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) + tpr_mean = np.mean(interp_tprs, axis = 0) + tpr_mean[-1] = 1.0 + tpr_std = 2*np.std(interp_tprs, axis = 0) + tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) + tpr_lower = tpr_mean - tpr_std + auc = np.mean(results[kind]['auc']) + + range_plot_x = [0,1] + range_plot_y = [0.2,1] + + import plotly.graph_objects as go + + + fig = go.Figure([go.Scatter(x = tpr_upper, y = 1 - fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = 1 - fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = tpr_mean, y = 1 - fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.5f}')]) + + fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) + fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) + fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') + fig.update_xaxes(range = range_plot_x, 
gridcolor = c_grid, constrain = 'domain', linecolor = 'black')
+
+ fig.write_image(f"plot/{folder_save}/Diff_ROCs/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}_new_{bgs[bg_choice]}_{bgs[bg_choice_2]}.jpg")
+ fig.write_image(f"plot/{folder_save}/Diff_ROCs/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}_new_{bgs[bg_choice]}_{bgs[bg_choice_2]}.pdf")
+
+def pretty_ROC_Curve_var_test_train_val(results, type, var):
+
+ params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate}
+ metrics = ['auc', 'fpr', 'tpr', 'thresholds']
+
+ c_fill = 'rgba(52, 152, 219, 0.2)'
+ c_line = 'rgba(52, 152, 219, 0.5)'
+ c_line_train = 'rgba(41, 128, 185, 1.0)'
+ c_line_test = 'rgba(58, 217, 19, 0.8)'
+ c_line_val = 'rgba(244, 70, 10, 0.8)'
+ c_grid = 'rgba(189, 195, 199, 0.5)'
+ c_annot = 'rgba(149, 165, 166, 0.5)'
+ c_highlight = 'rgba(192, 57, 43, 1.0)'
+
+ fpr_mean = np.linspace(0, 1, 100)
+
+ range_plot_x = [0,1]
+ range_plot_y = [0.2,1]
+
+ import plotly.graph_objects as go
+ colours = {'test':c_line_test, 'train': c_line_train, 'val': c_line_val}
+ figs = {'test': None, 'train': None, 'val': None}
+ for kind in ['test', 'val', 'train']:
+ # reset per kind, otherwise the curves of the previous kind leak into this mean/band
+ interp_tprs = []
+ for i in range(1):
+ fpr = results[kind]['fpr'][i]
+ tpr = results[kind]['tpr'][i]
+ interp_tpr = np.interp(fpr_mean, fpr, tpr)
+ interp_tpr[0] = 0.0
+ interp_tprs.append(interp_tpr)
+ tpr_mean = np.mean(interp_tprs, axis = 0)
+ tpr_mean[-1] = 1.0
+ tpr_std = 2*np.std(interp_tprs, axis = 0)
+ tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1)
+ tpr_lower = tpr_mean - tpr_std
+ auc = np.mean(results[kind]['auc'])
+ colour = colours[kind]
+
+
+ figs[kind] = go.Scatter(x = tpr_mean, y = 1 - fpr_mean, line = dict(color = colour, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.5f}, {kind}')
+ fig = go.Figure(data = [figs['test'], figs['train'], figs['val']])
+
+ fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0)
+ fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,))
+ fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black')
+ fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black')
+
+ if not os.path.exists(f"plot/{folder_save}/ROC"):
+ os.mkdir(f"plot/{folder_save}/ROC")
+
+ fig.write_image(f"plot/{folder_save}/ROC/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_all_{var}_new_{bgs[bg_choice]}_{bgs[bg_choice_2]}.jpg")
+ fig.write_image(f"plot/{folder_save}/ROC/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_all_{var}_new_{bgs[bg_choice]}_{bgs[bg_choice_2]}.pdf")
+###############################################################################################################################
+
+for versions in interesting_combos:
+ versions_true = [int(version) for version in versions]
+ versions = [True if value in versions_true else False for value in range(0, len(names_sig))]
+ print(versions)
+ print(np.array(names_sig)[versions])
+ var = np.array(names_sig)[versions]
+
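 # the training CSV appears to name its columns <feature>_<region>
+ # (e.g. Higgs_mass_low_mumu), so the masked feature names get the region
+ # suffix attached before the dataframe lookup below
+ 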
var = [f"{va}_{roi}" for va in var] + + time = arrow.now().format("YY_MM_DD") + plt.style.use(hep.style.ROOT) + + + df = df.sample(frac = 1).reset_index(drop=True) + + X = df[list(var)] + print(X) + print(X.info()) + + X_signal = df[var][df.target == 1] + X_bg = df[var][df.target == 0] + + y = df["target"] + print(y) + + y_signal = df["target"][df.target == 1] + y_bg = df["target"][df.target == 0] + + from sklearn.impute import SimpleImputer + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import OneHotEncoder + + categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + + from sklearn.preprocessing import StandardScaler + numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + + cat_cols = X.select_dtypes(exclude = "number").columns + num_cols = X.select_dtypes(include = "number").columns + + print(cat_cols) + print(num_cols) + + from sklearn.compose import ColumnTransformer + + full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + + + + X_processed = full_processor.fit_transform(X) + y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + + y_processed_sig = SimpleImputer(strategy = "most_frequent").fit_transform(y_signal.values.reshape(-1,1)) + y_processed_bg = SimpleImputer(strategy = "most_frequent").fit_transform(y_bg.values.reshape(-1,1)) + + from sklearn.model_selection import train_test_split + + X_train, X_test, y_train, y_test = train_test_split(X, y_processed, stratify = y_processed, random_state = 1121218) + X_train_sig, X_test_sig, y_train_sig, y_test_sig = train_test_split(X_signal, y_processed_sig, stratify = y_processed_sig, random_state = 1121218) + X_train_bg, X_test_bg, y_train_bg, y_test_bg = train_test_split(X_bg, y_processed_bg, stratify = y_processed_bg, random_state = 1121218) + #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 101, stratify = y) + print(X_train) + print(X_test) + print(y_train) + + + + + + pretty_ROC_Curve(f"plot/{folder_save}/results_lr_{learning_rate}_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}.json", kind, "full", versions_true) +############################################################################################################################################################## +##################### Zero train ROC ######################################################################################################################### +############################################################################################################################################################## + + pretty_ROC_Curve(f"plot/{folder_save}/results_zero_train_lr_{learning_rate}_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}.json", kind, 'zero', versions_true) + +############################################################################################################################################################## +##################### Weak train ROC ######################################################################################################################### +############################################################################################################################################################## + + 
pretty_ROC_Curve(f"plot/{folder_save}/results_weak_train_lr_{learning_rate}_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}.json", kind, 'weak', versions_true) + +############################################################################################################################################################## + + + trials = Trials() + +############################################################################################################################################################## +##################### Initiate the final training to be presented with the best parameters ################################################################### +############################################################################################################################################################## + + from sklearn.metrics import accuracy_score + +### Init classifier + xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = 0.0292, gamma = 1.087, reg_alpha = 42.0, reg_lambda = 0.381, n_estimators = 200, max_depth = 8, subsample = 0.841, min_child_weight = 2.0, colsample_bytree = 0.994, scale_pos_weight = 10) + +### Fit + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + dtest = xgb.DMatrix(X_test, label = y_test) + dtest_signal = xgb.DMatrix(X_test_sig, label = y_test_sig) + dtest_bg = xgb.DMatrix(X_test_bg, label = y_test_bg) +#print(dtest) + dtrain = xgb.DMatrix(X_train[:int(len(X_train)*0.8)], label = y_train[:int(len(y_train)*0.8)]) + dval = xgb.DMatrix(X_train[int(len(X_train)*0.8):], label = y_train[int(len(y_train)*0.8):]) + model_xgb = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 200) #num_boost_round = 1000, + model_xgb_weak = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 20) #num_boost_round = 1000, + model_xgb_zero = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 2) #num_boost_round = 1000, + sets = [dtrain, dval, dtest] + results_new = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + results_new_weak = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + results_new_zero = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + params_new = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss'} + + for i, ds in enumerate(results_new.keys()): + print(i) + y_preds_new = model_xgb.predict(sets[i]) + y_preds_new_weak = model_xgb_weak.predict(sets[i]) + y_preds_new_zero = model_xgb_zero.predict(sets[i]) + labels_new = sets[i].get_label() + fpr_new, tpr_new, thresholds_new = roc_curve(labels_new, y_preds_new) + fpr_new_weak, tpr_new_weak, thresholds_new_weak = roc_curve(labels_new, y_preds_new_weak) + fpr_new_zero, tpr_new_zero, thresholds_new_zero = roc_curve(labels_new, y_preds_new_zero) + results_new[ds]['fpr'].append(fpr_new) + results_new[ds]['tpr'].append(tpr_new) + results_new[ds]['thresholds'].append(thresholds_new) + results_new[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new)) + results_new_weak[ds]['fpr'].append(fpr_new_weak) + results_new_weak[ds]['tpr'].append(tpr_new_weak) + 
results_new_weak[ds]['thresholds'].append(thresholds_new_weak) + results_new_weak[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new_weak)) + results_new_zero[ds]['fpr'].append(fpr_new_zero) + results_new_zero[ds]['tpr'].append(tpr_new_zero) + results_new_zero[ds]['thresholds'].append(thresholds_new_zero) + results_new_zero[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new_zero)) + + + + pretty_ROC_Curve_var(results_new, 'test', 'full', versions_true) + + pretty_ROC_Curve_var_test_train_val(results_new, 'full', versions_true) + + xgb_cl.fit(X_train, y_train) + + print(xgb_cl) + +################################################################################################################################### +################################## Predict and give the final accuracy scores and importance plots ################################ +################################################################################################################################### + preds = xgb_cl.predict(X_test) + + print(accuracy_score(y_test, preds)) + + print(y_test) + print(model_xgb.predict(dtest)) + print(np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)])) + predict_train = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)]) + predict_train_weak = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb_weak.predict(dtest)]) + predict_train_zero = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb_zero.predict(dtest)]) + + print(accuracy_score(y_test, predict_train)) + + from xgboost import plot_importance + from xgboost import plot_tree, to_graphviz + + #importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_}) + #importances = importances.sort_values(by = "Importance", ascending = False) + #importances = importances.set_index('Feature') + #print(importances) + #importances.plot.bar() + ''' +fig, ax = plt.subplots(figsize=(17,12)) +plot_importance(xgb_cl, fmap = 'feature_map_var.txt', ax = ax) +plt.xlabel('Feature scores') +plt.ylabel("Feature names") +plt.title('Importance plot') +plt.legend(['']) +#plt.show() +plt.savefig(f"plot/{folder_save}/importance_{var}.jpg") + + +feature_importance = model_xgb.get_score(importance_type = 'weight') +keys = list(feature_importance.keys()) +names_sig = ['m(H)', '$p_t$(H)', '$p_t$(Z)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$', + '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet', + '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$', + '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$'] +names_sig = ['$\Delta\Phi (l_{subleading}, jet_{subleading})$'] +values = list(feature_importance.values()) +data = pd.DataFrame(data = values, index = names_sig, columns = ['score']).sort_values(by = 'score', ascending = False) +print(data) +print(data.index) + + +fig = plt.figure(figsize=(17,12)) +ax1 = fig.add_subplot(1,2,1) +ax1.set_axis_off() +ax2 = fig.add_subplot(1,2,2) +ax2.barh(list(reversed(data.index)), list(reversed(data.score))) +ax2.set_xlabel('Feature scores') +ax2.set_ylabel("Feature names") +ax2.set_title('Importance plot') +#plt.show() +plt.savefig(f"plot/{folder_save}/importance_train_lr_{learning_rate}_{var}.jpg") + + +plt.figure(figsize=(17,12)) +plot_tree(xgb_cl, fmap = 
'feature_map_var.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/boost_tree_{var}.jpg", dpi = 1800)
+###result = 1/(1+np.exp(-leaf_value)) for belonging to class 1
+#plt.show()
+
+plt.figure(figsize=(17,12))
+plot_tree(model_xgb, fmap = 'feature_map_var.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/boost_tree_train_lr_{learning_rate}_{var}.jpg", dpi = 1800)
+###result = 1/(1+np.exp(-leaf_value)) for belonging to class 1
+#plt.show()
+ '''
+ plt.figure(figsize=(17,12))
+ plt.hist(np.array(model_xgb.predict(dtest)), bins = 40, edgecolor = 'blue',fill = False)
+ plt.hist(np.array(predict_train), bins = 40, edgecolor = 'green', hatch = '/', fill = False)
+ plt.hist(y_test, bins = 40, facecolor = 'orange', edgecolor = 'orange', fill = False)
+ plt.title('Classifier output')
+ plt.legend(['Train output', 'Train output after threshold','Test data'])
+ #plt.show()
+ plt.savefig(f"plot/{folder_save}/class_output_train_lr_{learning_rate}_{versions_true}_{bgs[bg_choice]}_{bgs[bg_choice_2]}.jpg")
+
+ plt.figure(figsize=(17,12))
+ plt.hist(np.array(model_xgb.predict(dtest_signal)), bins = 40, edgecolor = 'blue',fill = False)
+ plt.hist(np.array(model_xgb.predict(dtest_bg)), bins = 40, edgecolor = 'red', fill = False)
+ plt.title('Classifier output')
+ plt.legend(['Signal', 'Background'])
+ #plt.show()
+ plt.savefig(f"plot/{folder_save}/class_output_train_lr_{learning_rate}_{versions_true}_sig_vs_bg_{bgs[bg_choice]}_{bgs[bg_choice_2]}.jpg")
+
+ plt.figure(figsize=(17,12))
+ plt.hist(np.array(model_xgb_weak.predict(dtest)), bins = 40, edgecolor = 'blue',fill = False)
+ plt.hist(np.array(predict_train_weak), bins = 40, edgecolor = 'green', hatch = '/', fill = False)
+ plt.hist(y_test, bins = 40, facecolor = 'orange', edgecolor = 'orange', fill = False)
+ plt.title('Classifier output')
+ plt.legend(['Train output', 'Train output after threshold','Test data'])
+ #plt.show()
+ plt.savefig(f"plot/{folder_save}/class_output_train_lr_{learning_rate}_{versions_true}_weak_{bgs[bg_choice]}_{bgs[bg_choice_2]}.jpg")
+
+ plt.figure(figsize=(17,12))
+ plt.hist(np.array(model_xgb_zero.predict(dtest)), bins = 40, edgecolor = 'blue',fill = False)
+ plt.hist(np.array(predict_train_zero), bins = 40, edgecolor = 'green', hatch = '/', fill = False)
+ plt.hist(y_test, bins = 40, facecolor = 'orange', edgecolor = 'orange', fill = False)
+ plt.title('Classifier output')
+ plt.legend(['Train output', 'Train output after threshold','Test data'])
+ #plt.show()
+ plt.savefig(f"plot/{folder_save}/class_output_train_lr_{learning_rate}_{versions_true}_zero_{bgs[bg_choice]}_{bgs[bg_choice_2]}.jpg")
+
+###result = 1/(1+np.exp(-leaf_value)) for belonging to class 1
+#plt.show()'''
+
+with open(f"plot/{folder_save}/ROC.txt", "a") as myfile:
+ myfile.write(f"Accuracy for {var}: " + str(accuracy_score(y_test, predict_train)) + " " + '\n')
+
+'''
+plt.figure(figsize=(17,12))
+to_graphviz(model_xgb, fmap = 'feature_map.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/boost_tree_train_graphviz.jpg", dpi = 1800)
+###result = 1/(1+np.exp(-leaf_value)) for belonging to class 1
+#plt.show()'''
diff --git a/xgb_test_only_xgb_reloaded_no_coffea_vars_bg_multibg_full_bg_set.py b/xgb_test_only_xgb_reloaded_no_coffea_vars_bg_multibg_full_bg_set.py
new file mode 100644
index 0000000..d13dc52
--- /dev/null
+++ b/xgb_test_only_xgb_reloaded_no_coffea_vars_bg_multibg_full_bg_set.py
@@ -0,0 +1,589 @@
+from coffea.util import load
+import 
numpy as np +import pandas as pd +import matplotlib.pyplot as plt, mplhep as hep +import hist +import argparse, sys, os, arrow, glob, yaml +from matplotlib.offsetbox import AnchoredText +import xgboost as xgb +from hyperopt import STATUS_OK, Trials, fmin, hp, tpe +from sklearn.metrics import accuracy_score +from tqdm.notebook import tqdm +from sklearn.metrics import roc_auc_score, roc_curve +from sklearn.model_selection import RepeatedKFold +import json + + +roi = 'low_ee' + +net_path = "/net/scratch_cms3a/vaulin/" +folder_save = 'eval_23_08_23_2' +if not os.path.exists(f"./plot/{folder_save}"): + os.mkdir(f"./plot/{folder_save}") +if not os.path.exists(f"./plot/{folder_save}/{roi}"): + os.mkdir(f"./plot/{folder_save}/{roi}") +if not os.path.exists(f"./plot/{folder_save}/Diff_ROCs"): + os.mkdir(f"./plot/{folder_save}/Diff_ROCs") +if not os.path.exists(f"./plot/{folder_save}/Diff_ROCs/{roi}"): + os.mkdir(f"./plot/{folder_save}/Diff_ROCs/{roi}") +if not os.path.exists(net_path + f"plot/{folder_save}"): + os.mkdir(net_path + f"plot/{folder_save}") + +df = pd.read_csv(f'./plot/{folder_save}/xgb_training_dataset_{roi}.csv') + + +learning_rate = 0.1 +eta = 0.1 + +bgs = ['DY', "ZZ", "WZ", "tt", "ZHtobb"] + +from itertools import combinations + +names_sig = ['Higgs_mass', 'Higgs_pt', 'Z_pt', 'jjVptratio', 'CvsL_max', + 'CvsL_min', 'CvsB_max', 'CvsB_min', 'pt_lead', 'pt_sublead', + 'del_phi_jjV', 'del_R_jj', 'del_eta_jj', 'del_phi_jj', 'del_phi_ll', 'del_eta_ll', + 'del_phi_l2_subleading', 'del_phi_l2_leading'] + +names_ind_array = np.arange(len(names_sig)) + +possible_combos = [list(combinations(names_ind_array, i)) for i in range(1,len(names_sig))] +#print(possible_combos) +print(possible_combos[1]) +#print(possible_combos[-1]) + + +length = [len(possible_el) for possible_el in possible_combos] +print(length) + +import random +sequence_list = np.arange(0,len(names_sig)) +#print(sequence_list) +random.shuffle(sequence_list) +print(sequence_list) + +interesting_combos = [] +combos = [] + +for i in range(0, len(length)): + #print([len(elem) for elem in possible_combos[i]]) + for j in range(0, len(possible_combos[i])): + #print(list(possible_combos[i][j])) + #print(list(sequence_list[:i])) + if sorted(list(possible_combos[i][j])) == sorted(list(sequence_list[:(i+1)])): + print(sorted(list(possible_combos[i][j]))) + print(sorted(list(sequence_list[:(i+1)]))) + print(i, j) + combos.append([i,j]) + interesting_combos.append(sorted(list(possible_combos[i][j]))) + +print(combos) +#for k in range(0,len(combos)): + +print(interesting_combos) + +interesting_combos.append([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]) + +print(interesting_combos) + + +############################################################################################################################################### +################### Getting ROC curves from json files ######################################################################################## +############################################################################################################################################### +def convert(x): + if hasattr(x, "tolist"): + return x.tolist() + raise TypeError(x) + + +kind = 'val' +#kind = 'test' +#kind = 'train' + +def pretty_ROC_Curve(tr_set, kind, type, var): + + with open(tr_set) as user_file: + file_contents = user_file.read() + + results = json.loads(file_contents) + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 
'tpr', 'thresholds'] + + c_fill = 'rgba(52, 152, 219, 0.2)' + c_line = 'rgba(52, 152, 219, 0.5)' + c_line_main = 'rgba(41, 128, 185, 1.0)' + c_grid = 'rgba(189, 195, 199, 0.5)' + c_annot = 'rgba(149, 165, 166, 0.5)' + c_highlight = 'rgba(192, 57, 43, 1.0)' + + fpr_mean = np.linspace(0, 1, 100) + + interp_tprs = [] + for i in range(100): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) + tpr_mean = np.mean(interp_tprs, axis = 0) + tpr_mean[-1] = 1.0 + tpr_std = 2*np.std(interp_tprs, axis = 0) + tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) + tpr_lower = tpr_mean - tpr_std + auc = np.mean(results[kind]['auc']) + + range_plot_x = [0,1] + range_plot_y = [0.2,1] + + import plotly.graph_objects as go + + + fig = go.Figure([go.Scatter(x = tpr_upper, y = 1 - fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = 1 - fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'lower'), + go.Scatter(x = tpr_mean, y = 1 - fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.5f}')]) + + fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0) + fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,)) + fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black') + fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black') + + fig.write_image(f"plot/{folder_save}/Diff_ROCs/{roi}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}.jpg") + fig.write_image(f"plot/{folder_save}/Diff_ROCs/{roi}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}.pdf") + +def pretty_ROC_Curve_var(results, kind, type, var): + + results = results + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + + c_fill = 'rgba(52, 152, 219, 0.2)' + c_line = 'rgba(52, 152, 219, 0.5)' + c_line_main = 'rgba(41, 128, 185, 1.0)' + c_grid = 'rgba(189, 195, 199, 0.5)' + c_annot = 'rgba(149, 165, 166, 0.5)' + c_highlight = 'rgba(192, 57, 43, 1.0)' + + fpr_mean = np.linspace(0, 1, 100) + + interp_tprs = [] + for i in range(1): + fpr = results[kind]['fpr'][i] + tpr = results[kind]['tpr'][i] + interp_tpr = np.interp(fpr_mean, fpr, tpr) + interp_tpr[0] = 0.0 + interp_tprs.append(interp_tpr) + tpr_mean = np.mean(interp_tprs, axis = 0) + tpr_mean[-1] = 1.0 + tpr_std = 2*np.std(interp_tprs, axis = 0) + tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1) + tpr_lower = tpr_mean - tpr_std + auc = np.mean(results[kind]['auc']) + + range_plot_x = [0,1] + range_plot_y = [0,1] + + import plotly.graph_objects as go + + + fig = go.Figure([go.Scatter(x = tpr_upper, y = 1 - fpr_mean, line = dict(color = c_line, width = 1), hoverinfo = 'skip', showlegend = False, name = 'upper'), + go.Scatter(x = tpr_lower, y = 1 - fpr_mean, fill = 'tonexty', fillcolor = c_fill, line = dict(color = c_line, width = 1), hoverinfo 
= 'skip', showlegend = False, name = 'lower'),
+ go.Scatter(x = tpr_mean, y = 1 - fpr_mean, line = dict(color = c_line_main, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.5f}')])
+
+ fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0)
+ fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,))
+ fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black')
+ fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black')
+
+ fig.write_image(f"plot/{folder_save}/Diff_ROCs/{roi}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}_new.jpg")
+ fig.write_image(f"plot/{folder_save}/Diff_ROCs/{roi}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_kind_{kind}_{var}_new.pdf")
+
+def pretty_ROC_Curve_var_test_train_val(results, type, var):
+
+ params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate}
+ metrics = ['auc', 'fpr', 'tpr', 'thresholds']
+
+ c_fill = 'rgba(52, 152, 219, 0.2)'
+ c_line = 'rgba(52, 152, 219, 0.5)'
+ c_line_train = 'rgba(41, 128, 185, 1.0)'
+ c_line_test = 'rgba(58, 217, 19, 0.8)'
+ c_line_val = 'rgba(244, 70, 10, 0.8)'
+ c_grid = 'rgba(189, 195, 199, 0.5)'
+ c_annot = 'rgba(149, 165, 166, 0.5)'
+ c_highlight = 'rgba(192, 57, 43, 1.0)'
+
+ fpr_mean = np.linspace(0, 1, 100)
+
+ range_plot_x = [0,1]
+ range_plot_y = [0,1]
+
+ import plotly.graph_objects as go
+ colours = {'test':c_line_test, 'train': c_line_train, 'val': c_line_val}
+ figs = {'test': None, 'train': None, 'val': None}
+ for kind in ['test', 'val', 'train']:
+ # reset per kind, otherwise the curves of the previous kind leak into this mean/band
+ interp_tprs = []
+ for i in range(1):
+ fpr = results[kind]['fpr'][i]
+ tpr = results[kind]['tpr'][i]
+ interp_tpr = np.interp(fpr_mean, fpr, tpr)
+ interp_tpr[0] = 0.0
+ interp_tprs.append(interp_tpr)
+ tpr_mean = np.mean(interp_tprs, axis = 0)
+ tpr_mean[-1] = 1.0
+ tpr_std = 2*np.std(interp_tprs, axis = 0)
+ tpr_upper = np.clip(tpr_mean + tpr_std, 0, 1)
+ tpr_lower = tpr_mean - tpr_std
+ auc = np.mean(results[kind]['auc'])
+ colour = colours[kind]
+
+
+ figs[kind] = go.Scatter(x = tpr_mean, y = 1 - fpr_mean, line = dict(color = colour, width = 2), hoverinfo = 'skip', showlegend = True, name = f'AUC: {auc:.5f}, {kind}')
+ fig = go.Figure(data = [figs['test'], figs['train'], figs['val']])
+
+ fig.add_shape(type = 'line', line = dict(dash = 'dash'), x0 = 0, x1 = 1, y0 = 1, y1 = 0)
+ fig.update_layout(template = 'plotly_white', title_x = 0.5, xaxis_title = 'TPR (signal efficiency)', yaxis_title = '1 - FPR (Background rejection)', width = 800, height = 800, legend = dict( yanchor = 'bottom', xanchor = 'right', x = 0.95, y = 0.01,))
+ fig.update_yaxes(range = range_plot_y, gridcolor = c_grid, scaleanchor = 'x', scaleratio = 1, linecolor = 'black')
+ fig.update_xaxes(range = range_plot_x, gridcolor = c_grid, constrain = 'domain', linecolor = 'black')
+
+ if not os.path.exists(f"plot/{folder_save}/ROC"):
+ os.mkdir(f"plot/{folder_save}/ROC")
+ if not os.path.exists(f"plot/{folder_save}/ROC/{roi}"):
+ os.mkdir(f"plot/{folder_save}/ROC/{roi}")
+
+
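 # plotly's static export below goes through an image backend; this assumes
+ # the kaleido package is available in the environment
+ 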
fig.write_image(f"plot/{folder_save}/ROC/{roi}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_all_{var}_new.jpg") + fig.write_image(f"plot/{folder_save}/ROC/{roi}/plotly_ROC_bg_rej_reloaded_{type}_lr_{learning_rate}_rangex_{range_plot_x}_rangey_{range_plot_y}_all_{var}_new.pdf") +############################################################################################################################### + +for versions in interesting_combos: + versions_true = [int(version) for version in versions] + versions = [True if value in versions_true else False for value in range(0, len(names_sig))] + print(versions) + print(np.array(names_sig)[versions]) + var = np.array(names_sig)[versions] + var = [f"{va}_{roi}" for va in var] + + time = arrow.now().format("YY_MM_DD") + plt.style.use(hep.style.ROOT) + + + df = df.sample(frac = 1).reset_index(drop=True) + + X = df[list(var)] + print(X) + print(X.info()) + + X_signal = df[var][df.target == 1] + X_bg = df[var][df.target == 0] + X_bg_dy = df[var][df.target_bg == 1] + X_bg_zz = df[var][df.target_bg == 2] + X_bg_wz = df[var][df.target_bg == 3] + X_bg_tt = df[var][df.target_bg == 4] + X_bg_zhtobb = df[var][df.target_bg == 5] + + y = df["target"] + print(y) + + y_signal = df["target"][df.target == 1] + y_bg = df["target"][df.target == 0] + y_bg_dy = df["target"][df.target_bg == 1] + y_bg_zz = df["target"][df.target_bg == 2] + y_bg_wz = df["target"][df.target_bg == 3] + y_bg_tt = df["target"][df.target_bg == 4] + y_bg_zhtobb = df["target"][df.target_bg == 5] + + from sklearn.impute import SimpleImputer + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import OneHotEncoder + + categorical_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "most_frequent")), ("oh-encode", OneHotEncoder(handle_unknown = "ignore", sparse = False)),]) + + from sklearn.preprocessing import StandardScaler + numeric_pipeline = Pipeline(steps = [("impute", SimpleImputer(strategy = "mean")), ("scale", StandardScaler())]) + + cat_cols = X.select_dtypes(exclude = "number").columns + num_cols = X.select_dtypes(include = "number").columns + + print(cat_cols) + print(num_cols) + + from sklearn.compose import ColumnTransformer + + full_processor = ColumnTransformer(transformers = [("numeric", numeric_pipeline, num_cols), ("categorical", categorical_pipeline, cat_cols),]) + + + + X_processed = full_processor.fit_transform(X) + y_processed = SimpleImputer(strategy = "most_frequent").fit_transform(y.values.reshape(-1,1)) + + y_processed_sig = SimpleImputer(strategy = "most_frequent").fit_transform(y_signal.values.reshape(-1,1)) + y_processed_bg = SimpleImputer(strategy = "most_frequent").fit_transform(y_bg.values.reshape(-1,1)) + + y_processed_bg_dy = SimpleImputer(strategy = "most_frequent").fit_transform(y_bg_dy.values.reshape(-1,1)) + y_processed_bg_zz = SimpleImputer(strategy = "most_frequent").fit_transform(y_bg_zz.values.reshape(-1,1)) + y_processed_bg_wz = SimpleImputer(strategy = "most_frequent").fit_transform(y_bg_wz.values.reshape(-1,1)) + y_processed_bg_tt = SimpleImputer(strategy = "most_frequent").fit_transform(y_bg_tt.values.reshape(-1,1)) + y_processed_bg_zhtobb = SimpleImputer(strategy = "most_frequent").fit_transform(y_bg_zhtobb.values.reshape(-1,1)) + + from sklearn.model_selection import train_test_split + + X_train, X_test, y_train, y_test = train_test_split(X, y_processed, stratify = y_processed, random_state = 1121218) + X_train_sig, X_test_sig, y_train_sig, y_test_sig = 
train_test_split(X_signal, y_processed_sig, stratify = y_processed_sig, random_state = 1121218) + X_train_bg, X_test_bg, y_train_bg, y_test_bg = train_test_split(X_bg, y_processed_bg, stratify = y_processed_bg, random_state = 1121218) + X_train_bg_dy, X_test_bg_dy, y_train_bg_dy, y_test_bg_dy = train_test_split(X_bg_dy, y_processed_bg_dy, stratify = y_processed_bg_dy, random_state = 1121218) + X_train_bg_zz, X_test_bg_zz, y_train_bg_zz, y_test_bg_zz = train_test_split(X_bg_zz, y_processed_bg_zz, stratify = y_processed_bg_zz, random_state = 1121218) + X_train_bg_wz, X_test_bg_wz, y_train_bg_wz, y_test_bg_wz = train_test_split(X_bg_wz, y_processed_bg_wz, stratify = y_processed_bg_wz, random_state = 1121218) + X_train_bg_tt, X_test_bg_tt, y_train_bg_tt, y_test_bg_tt = train_test_split(X_bg_tt, y_processed_bg_tt, stratify = y_processed_bg_tt, random_state = 1121218) + X_train_bg_zhtobb, X_test_bg_zhtobb, y_train_bg_zhtobb, y_test_bg_zhtobb = train_test_split(X_bg_zhtobb, y_processed_bg_zhtobb, stratify = y_processed_bg_zhtobb, random_state = 1121218) + #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 101, stratify = y) + print(X_train) + print(X_test) + print(y_train) + + + + + + pretty_ROC_Curve(f"plot/{folder_save}/results_lr_{roi}_{eta}_bg_full_bg_set.json", kind, "full", versions_true) +############################################################################################################################################################## +##################### Zero train ROC ######################################################################################################################### +############################################################################################################################################################## + + pretty_ROC_Curve(f"plot/{folder_save}/results_zero_train_lr_{roi}_{eta}_bg_full_bg_set.json", kind, 'zero', versions_true) + +############################################################################################################################################################## +##################### Weak train ROC ######################################################################################################################### +############################################################################################################################################################## + + pretty_ROC_Curve(f"plot/{folder_save}/results_weak_train_lr_{roi}_{eta}_bg_full_bg_set.json", kind, 'weak', versions_true) + +############################################################################################################################################################## + + + trials = Trials() + +############################################################################################################################################################## +##################### Initiate the final training to be presented with the best parameters ################################################################### +############################################################################################################################################################## + + from sklearn.metrics import accuracy_score + +### Init classifier + xgb_cl = xgb.XGBClassifier(booster = 'gbtree', learning_rate = 0.0292, gamma = 1.087, reg_alpha = 42.0, reg_lambda = 0.381, n_estimators = 200, max_depth = 8, subsample = 0.841, min_child_weight = 2.0, colsample_bytree = 0.994, scale_pos_weight = 
10) + +### Fit + params = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss', 'eta': learning_rate} + metrics = ['auc', 'fpr', 'tpr', 'thresholds'] + dtest = xgb.DMatrix(X_test, label = y_test) + dtest_signal = xgb.DMatrix(X_test_sig, label = y_test_sig) + dtest_bg = xgb.DMatrix(X_test_bg, label = y_test_bg) + dtest_bg_dy = xgb.DMatrix(X_test_bg_dy, label = y_test_bg_dy) + dtest_bg_zz = xgb.DMatrix(X_test_bg_zz, label = y_test_bg_zz) + dtest_bg_wz = xgb.DMatrix(X_test_bg_wz, label = y_test_bg_wz) + dtest_bg_tt = xgb.DMatrix(X_test_bg_tt, label = y_test_bg_tt) + dtest_bg_zhtobb = xgb.DMatrix(X_test_bg_zhtobb, label = y_test_bg_zhtobb) +#print(dtest) + dtrain = xgb.DMatrix(X_train[:int(len(X_train)*0.8)], label = y_train[:int(len(y_train)*0.8)]) + dval = xgb.DMatrix(X_train[int(len(X_train)*0.8):], label = y_train[int(len(y_train)*0.8):]) + model_xgb = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 200) #num_boost_round = 1000, + model_xgb_weak = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 20) #num_boost_round = 1000, + model_xgb_zero = xgb.train(dtrain = dtrain, params = params, evals = [(dtrain, 'train'),(dval, 'dval')], + verbose_eval = 1, early_stopping_rounds = 30, num_boost_round = 2) #num_boost_round = 1000, + sets = [dtrain, dval, dtest] + results_new = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + results_new_weak = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + results_new_zero = {'train': {m:[] for m in metrics}, + 'val': {m:[] for m in metrics}, + 'test': {m:[] for m in metrics}} + params_new = {'objective' : 'binary:logistic', 'eval_metric' : 'logloss'} + + for i, ds in enumerate(results_new.keys()): + print(i) + y_preds_new = model_xgb.predict(sets[i]) + y_preds_new_weak = model_xgb_weak.predict(sets[i]) + y_preds_new_zero = model_xgb_zero.predict(sets[i]) + labels_new = sets[i].get_label() + fpr_new, tpr_new, thresholds_new = roc_curve(labels_new, y_preds_new) + fpr_new_weak, tpr_new_weak, thresholds_new_weak = roc_curve(labels_new, y_preds_new_weak) + fpr_new_zero, tpr_new_zero, thresholds_new_zero = roc_curve(labels_new, y_preds_new_zero) + results_new[ds]['fpr'].append(fpr_new) + results_new[ds]['tpr'].append(tpr_new) + results_new[ds]['thresholds'].append(thresholds_new) + results_new[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new)) + results_new_weak[ds]['fpr'].append(fpr_new_weak) + results_new_weak[ds]['tpr'].append(tpr_new_weak) + results_new_weak[ds]['thresholds'].append(thresholds_new_weak) + results_new_weak[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new_weak)) + results_new_zero[ds]['fpr'].append(fpr_new_zero) + results_new_zero[ds]['tpr'].append(tpr_new_zero) + results_new_zero[ds]['thresholds'].append(thresholds_new_zero) + results_new_zero[ds]['auc'].append(roc_auc_score(labels_new, y_preds_new_zero)) + + + + pretty_ROC_Curve_var(results_new, 'test', 'full', versions_true) + + pretty_ROC_Curve_var_test_train_val(results_new, 'full', versions_true) + + xgb_cl.fit(X_train, y_train) + + print(xgb_cl) + +################################################################################################################################### +################################## Predict and give the final accuracy 
scores and importance plots ################################
+###################################################################################################################################
+ preds = xgb_cl.predict(X_test)
+
+ print(accuracy_score(y_test, preds))
+
+ print(y_test)
+ print(model_xgb.predict(dtest))
+ print(np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)]))
+ # binarise the booster scores at 0.5 (same as (model_xgb.predict(dtest) > 0.5).astype(int))
+ predict_train = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb.predict(dtest)])
+ predict_train_weak = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb_weak.predict(dtest)])
+ predict_train_zero = np.array([1 if dtest_val > 0.5 else 0 for dtest_val in model_xgb_zero.predict(dtest)])
+
+ print(accuracy_score(y_test, predict_train))
+
+ from xgboost import plot_importance
+ from xgboost import plot_tree, to_graphviz
+
+ #importances = pd.DataFrame({'Feature': X.select_dtypes(include = "number").columns, 'Importance': xgb_cl.feature_importances_})
+ #importances = importances.sort_values(by = "Importance", ascending = False)
+ #importances = importances.set_index('Feature')
+ #print(importances)
+ #importances.plot.bar()
+ '''
+fig, ax = plt.subplots(figsize=(17,12))
+plot_importance(xgb_cl, fmap = 'feature_map_var.txt', ax = ax)
+plt.xlabel('Feature scores')
+plt.ylabel("Feature names")
+plt.title('Importance plot')
+plt.legend([''])
+#plt.show()
+plt.savefig(f"plot/{folder_save}/importance_{var}.jpg")
+
+
+feature_importance = model_xgb.get_score(importance_type = 'weight')
+keys = list(feature_importance.keys())
+names_sig = ['m(H)', '$p_t$(H)', '$p_t$(Z)', '$\\frac{p_t(V)}{p_t(H)}$', '$CvsL_{max}$',
+ '$CvsL_{min}$', '$CvsB_{max}$', '$CvsB_{min}$', '$p_t$ of $CvsL_{max}$ jet', '$p_t$ of $CvsL_{min}$ jet',
+ '$\Delta\Phi(V, H)$', '$\Delta R(jet_1, jet_2)$', '$\Delta\eta(jet_1, jet_2)$', '$\Delta\Phi(jet_1, jet_2)$', '$\Delta\Phi(l_1, l_2)$', '$\Delta\eta(l_1, l_2)$',
+ '$\Delta\Phi (l_{subleading}, jet_{subleading})$', '$\Delta\Phi (l_{subleading}, jet_{leading})$']
+names_sig = ['$\Delta\Phi (l_{subleading}, jet_{subleading})$']
+values = list(feature_importance.values())
+data = pd.DataFrame(data = values, index = names_sig, columns = ['score']).sort_values(by = 'score', ascending = False)
+print(data)
+print(data.index)
+
+
+fig = plt.figure(figsize=(17,12))
+ax1 = fig.add_subplot(1,2,1)
+ax1.set_axis_off()
+ax2 = fig.add_subplot(1,2,2)
+ax2.barh(list(reversed(data.index)), list(reversed(data.score)))
+ax2.set_xlabel('Feature scores')
+ax2.set_ylabel("Feature names")
+ax2.set_title('Importance plot')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/importance_train_lr_{learning_rate}_{var}.jpg")
+
+
+plt.figure(figsize=(17,12))
+plot_tree(xgb_cl, fmap = 'feature_map_var.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/boost_tree_{var}.jpg", dpi = 1800)
+###result = 1/(1+np.exp(-leaf_value)) for belonging to class 1
+#plt.show()
+
+plt.figure(figsize=(17,12))
+plot_tree(model_xgb, fmap = 'feature_map_var.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/boost_tree_train_lr_{learning_rate}_{var}.jpg", dpi = 1800)
+###result = 1/(1+np.exp(-leaf_value)) for belonging to class 1
+#plt.show()
+ '''
+ # 26 uniform edges on [0, 1], i.e. 25 bins (equivalent to np.linspace(0, 1, 26))
+ bins = np.array([0 + i*((1-0)/25) for i in range(0, 26)])
+ plt.figure(figsize=(17,12))
+ plt.hist(np.array(model_xgb.predict(dtest)), bins = bins, edgecolor = 'blue',fill = False)
+ plt.hist(np.array(predict_train), bins = bins, edgecolor = 'green', hatch = '/', fill = False)
+
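 # y_test is binary, so this reference histogram collapses into the two
+ # outermost bins; it only marks the sizes of the two label populations
+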
 plt.hist(y_test, bins = bins, facecolor = 'orange', edgecolor = 'orange', fill = False)
+ plt.title('Classifier output')
+ plt.legend(['Train output', 'Train output after threshold','Test data'])
+ #plt.show()
+ plt.savefig(f"plot/{folder_save}/{roi}/class_output_train_lr_{learning_rate}_{versions_true}.jpg")
+
+ plt.figure(figsize=(17,12))
+ plt.hist(np.array(model_xgb.predict(dtest_signal)), bins = bins, edgecolor = 'blue',fill = False)
+ plt.hist(np.array(model_xgb.predict(dtest_bg)), bins = bins, edgecolor = 'red', fill = False)
+ plt.title('Classifier output')
+ plt.legend(['Signal', 'Background'])
+ #plt.show()
+ plt.savefig(f"plot/{folder_save}/{roi}/class_output_train_lr_{learning_rate}_{versions_true}_sig_vs_bg.jpg")
+
+ # evaluate the classifier once per background process; the list is reused by
+ # both the raw and the cross-section-weighted stacks below
+ bg_preds = [np.array(model_xgb.predict(d)) for d in (dtest_bg_dy, dtest_bg_zz, dtest_bg_wz, dtest_bg_tt, dtest_bg_zhtobb)]
+
+ plt.figure(figsize=(17,12))
+
+ plt.hist(bg_preds, bins = bins, color=['g', 'y', 'b', 'm', 'c'], stacked = True, alpha = 0.5)
+ plt.hist(np.array(model_xgb.predict(dtest_signal)), bins = bins, edgecolor = 'red', fill = False)
+ plt.title('Classifier output')
+ plt.legend(['DY', 'ZZ', 'WZ', "tt", "ZHtobb", 'Signal'])
+ #plt.show()
+ plt.savefig(f"plot/{folder_save}/{roi}/class_output_lr_{learning_rate}_{versions_true}_sig_vs_bg_alls.jpg")
+
+ lumi = 41480 # integrated luminosity in pb^-1 (41.48 fb^-1, i.e. the 2017 dataset)
+ # per-process event weight: cross section [pb] * lumi / number of generated events
+ xsec_weights = [6077.*(lumi/102863931), 3.74*(lumi/19134840), 6.419*(lumi/18136498), 88.51*(lumi/105859990), 0.00720*(lumi/4337504)]
+
+ plt.figure(figsize=(17,12))
+
+ plt.hist(bg_preds, bins = bins, weights = [w*np.ones(len(p)) for w, p in zip(xsec_weights, bg_preds)], color=['g', 'y', 'b', 'm', 'c'], stacked = True, alpha = 0.5)
+ plt.hist(np.array(model_xgb.predict(dtest_signal)), bins = bins, edgecolor = 'red', fill = False)
+ plt.title('Classifier output')
+ plt.legend(['DY', 'ZZ', 'WZ', "tt", "ZHtobb", 'Signal'])
+ #plt.show()
+ plt.savefig(f"plot/{folder_save}/{roi}/class_output_scaled_lr_{learning_rate}_{versions_true}_sig_vs_bg_alls.jpg")
+
+
+ plt.figure(figsize=(17,12))
+ plt.hist(np.array(model_xgb_weak.predict(dtest)), bins = bins, edgecolor = 'blue',fill = False)
+ plt.hist(np.array(predict_train_weak), bins = bins, edgecolor = 'green', hatch = '/', fill = False)
+ plt.hist(y_test, bins = bins, facecolor = 'orange', edgecolor = 'orange', fill = False)
+ plt.title('Classifier output')
+ plt.legend(['Train output', 'Train output after threshold','Test data'])
+ #plt.show()
+ plt.savefig(f"plot/{folder_save}/{roi}/class_output_train_lr_{learning_rate}_{versions_true}_weak.jpg")
+
+ plt.figure(figsize=(17,12))
+
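 # same overlay for the 2-round 'zero' model, kept as a heavily
+ # under-trained reference
+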
 plt.hist(np.array(model_xgb_zero.predict(dtest)), bins = bins, edgecolor = 'blue',fill = False)
+ plt.hist(np.array(predict_train_zero), bins = bins, edgecolor = 'green', hatch = '/', fill = False)
+ plt.hist(y_test, bins = bins, facecolor = 'orange', edgecolor = 'orange', fill = False)
+ plt.title('Classifier output')
+ plt.legend(['Train output', 'Train output after threshold','Test data'])
+ #plt.show()
+ plt.savefig(f"plot/{folder_save}/{roi}/class_output_train_lr_{learning_rate}_{versions_true}_zero.jpg")
+
+###result = 1/(1+np.exp(-leaf_value)) for belonging to class 1
+#plt.show()'''
+
+with open(f"plot/{folder_save}/ROC.txt", "a") as myfile:
+ myfile.write(f"Accuracy for {var}: " + str(accuracy_score(y_test, predict_train)) + " " + '\n')
+
+'''
+plt.figure(figsize=(17,12))
+to_graphviz(model_xgb, fmap = 'feature_map.txt')
+plt.title('Decision tree graph')
+#plt.show()
+plt.savefig(f"plot/{folder_save}/boost_tree_train_graphviz.jpg", dpi = 1800)
+###result = 1/(1+np.exp(-leaf_value)) for belonging to class 1
+#plt.show()'''
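
+# Minimal serialisation sketch (an assumption, not part of the original flow):
+# the 'convert' helper defined above turns numpy arrays into lists, which is
+# what the json module needs in order to store the ROC containers; the target
+# file name mirrors the one read back by pretty_ROC_Curve and is illustrative.
+#with open(f"plot/{folder_save}/results_lr_{roi}_{eta}_bg_full_bg_set.json", "w") as f_out:
+#    json.dump(results_new, f_out, default = convert)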