From 7c8d900a667e81f577694bbdb5e6862031606cb3 Mon Sep 17 00:00:00 2001
From: conformist89
Date: Fri, 30 Jul 2021 15:36:11 +0200
Subject: [PATCH 01/40] Initial cluster code commit

---
 .gitignore                   | 138 ++++++++++++++++++++
 CBM_ML_XGB_package.py        | 213 ------------------------------
 The CBM ML-XGB package.ipynb | 246 -----------------------------------
 clusterXGB.py                | 192 +++++++++++++++++++++++++++
 4 files changed, 330 insertions(+), 459 deletions(-)
 create mode 100644 .gitignore
 delete mode 100644 CBM_ML_XGB_package.py
 delete mode 100644 The CBM ML-XGB package.ipynb
 create mode 100644 clusterXGB.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a81c8ee
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,138 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ diff --git a/CBM_ML_XGB_package.py b/CBM_ML_XGB_package.py deleted file mode 100644 index bb56c20..0000000 --- a/CBM_ML_XGB_package.py +++ /dev/null @@ -1,213 +0,0 @@ -#We import some libraries -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt -import xgboost as xgb -from sklearn.model_selection import train_test_split -from sklearn.metrics import confusion_matrix -from sklearn.model_selection import RandomizedSearchCV, cross_val_score -from scipy.stats import uniform -import weakref -from bayes_opt import BayesianOptimization -from data_cleaning import clean_df -from KFPF_lambda_cuts import KFPF_lambda_cuts -from plot_tools import AMS, preds_prob, plot_confusion_matrix -import gc -from tree_importer import tree_importer - -#To save some memory we will delete unused variables -class TestClass(object): - def check(self): - print ("object is alive!") - def __del__(self): - print ("object deleted") - -#to paralellize some part of the code -from concurrent.futures import ThreadPoolExecutor -executor = ThreadPoolExecutor(8) - - -# We import three root files into our jupyter notebook -signal= tree_importer('/home/shahid/cbmsoft/Data/PFSimplePlainTreeSignal.root','PlainTree') - -# We only select lambda candidates -sgnal = signal[(signal['LambdaCandidates_is_signal']==1) & (signal['LambdaCandidates_mass']>1.108) - & (signal['LambdaCandidates_mass']<1.1227)] - -# Similarly for the background -background = tree_importer('/home/shahid/cbmsoft/Data/PFSimplePlainTreeBackground.root','PlainTree') -bg = background[(background['LambdaCandidates_is_signal'] == 0) - & ((background['LambdaCandidates_mass'] > 1.07) - & (background['LambdaCandidates_mass'] < 1.108) | (background['LambdaCandidates_mass']>1.1227) - & (background['LambdaCandidates_mass'] < 2.00))] - -#delete unused variables -del signal -del background - -#we also import a 10k events data set, generated using URQMD with AuAu collisions at 12AGeV -file = tree_importer('/home/shahid/cbmsoft/Data/10k_events_PFSimplePlainTree.root','PlainTree') -#Call the python garbage collector to clean up things -gc.collect() -df_original= pd.DataFrame(data=file) -del file - -#The labels of the columns in the df data frame are having the prefix LambdaCandidates_ so we rename them -new_labels= ['chi2geo', 'chi2primneg', 'chi2primpos', 'chi2topo', 'cosineneg', - 'cosinepos', 'cosinetopo', 'distance', 'eta', 'l', 'ldl', - 'mass', 'p', 'pT', 'phi', 'px', 'py', 'pz', 'rapidity', - 'x', 'y', 'z', 'daughter1id', 'daughter2id', 'isfrompv', 'pid', 'issignal'] - -sgnal.columns = new_labels -bg.columns = new_labels -df_original.columns=new_labels - -# Next we clean the data using the clean_df function saved in another .py file - -#Creating a new data frame and saving the results in it after cleaning of the original dfs -#Also keeping the original one -bcknd = clean_df(bg) -signal = clean_df(sgnal) - -del bg -del sgnal -gc.collect() - -df_clean = clean_df(df_original) -del df_original -gc.collect() - -# We randomly choose our signal set of 4000 
candidates -signal_selected= signal.sample(n=90000) - -#background = 3 times the signal is also done randomly -background_selected = bcknd - -del signal -del bcknd - -#Let's combine signal and background -dfs = [signal_selected, background_selected] -df_scaled = pd.concat(dfs) -# Let's shuffle the rows randomly -df_scaled = df_scaled.sample(frac=1) -del dfs - - -# The following columns will be used to predict whether a reconstructed candidate is a lambda particle or not -cuts = [ 'chi2primneg', 'chi2primpos', 'ldl', 'distance', 'chi2geo'] -x = df_scaled[cuts].copy() - -# The MC information is saved in this y variable -y =pd.DataFrame(df_scaled['issignal'], dtype='int') - -#We do the same for the 10k events data set -x_whole = df_clean[cuts].copy() -y_whole = pd.DataFrame(df_clean['issignal'], dtype='int') - - -#Creating a train and test set from the signal and background combined data set -x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=324) - -dtrain = xgb.DMatrix(x_train, label = y_train) -dtest = xgb.DMatrix(x_whole, label = y_whole) -dtest1=xgb.DMatrix(x_test, label = y_test) - -#Bayesian Optimization function for xgboost -#specify the parameters you want to tune as keyword arguments -def bo_tune_xgb(max_depth, gamma, alpha, n_estimators ,learning_rate): - params = {'max_depth': int(max_depth), - 'gamma': gamma, - 'alpha':alpha, - 'n_estimators': n_estimators, - 'learning_rate':learning_rate, - 'subsample': 0.8, - 'eta': 0.1, - 'eval_metric': 'auc', 'nthread' : 7} - cv_result = xgb.cv(params, dtrain, num_boost_round=70, nfold=5) - return cv_result['test-auc-mean'].iloc[-1] - - -#Invoking the Bayesian Optimizer with the specified parameters to tune -xgb_bo = BayesianOptimization(bo_tune_xgb, {'max_depth': (4, 10), - 'gamma': (0, 1), - 'alpha': (2,20), - 'learning_rate':(0,1), - 'n_estimators':(100,500) - }) - -#performing Bayesian optimization for 5 iterations with 8 steps of random exploration with an #acquisition function of expected improvement -xgb_bo.maximize(n_iter=5, init_points=8, acq='ei') - -max_param = xgb_bo.max['params'] -param= {'alpha': max_param['alpha'], 'gamma': max_param['gamma'], 'learning_rate': max_param['learning_rate'], 'max_depth': int(round(max_param['max_depth'],0)), 'n_estimators': int(round(max_param['n_estimators'],0)), 'objective': 'binary:logistic'} - -bst = xgb.train(param, dtrain) -bst1= bst.predict(dtrain) - -bst_test = pd.DataFrame(data=bst.predict(dtest1), columns=["xgb_preds"]) -y_test=y_test.set_index(np.arange(0,bst_test.shape[0])) -bst_test['issignal']=y_test['issignal'] - -#To visualize the predictions of the classifier in terms of probabilities -#The first argument should be a data frame, the second a column in it, in the form 'preds' -preds_prob(bst_test,'xgb_preds', 'issignal') - -#To choose the best threshold -train_best, test_best = AMS(y_train, bst1,y_test, bst_test['xgb_preds']) - -#Applying XGB on the 10k events data-set -df_clean['xgb_preds'] = bst.predict(dtest) - -#plot confusion matrix -cut1 = test_best -df_clean['xgb_preds1'] = ((df_clean['xgb_preds']>cut1)*1) -cnf_matrix = confusion_matrix(y_whole, df_clean['xgb_preds1'], labels=[1,0]) -#cnf_matrix = confusion_matrix(new_check_set['issignal'], new_check_set['new_signal'], labels=[1,0]) -np.set_printoptions(precision=2) -fig, axs = plt.subplots(figsize=(10, 8)) -axs.yaxis.set_label_coords(-0.04,.5) -axs.xaxis.set_label_coords(0.5,-.005) -plot_confusion_matrix(cnf_matrix, classes=['signal','background'], title='Confusion Matrix for XGB for cut > 
'+str(cut1)) - - -#comparison with manual cuts -#returns a new df -new_check_set=KFPF_lambda_cuts(df_clean) - - -cut3 = test_best -mask1 = df_clean['xgb_preds']>cut3 -df3_base=df_clean[mask1] -from matplotlib import gridspec -range1= (1.0999, 1.17) -fig, axs = plt.subplots(2, 1,figsize=(15,10), sharex=True, gridspec_kw={'width_ratios': [10], - 'height_ratios': [8,4]}) - -ns, bins, patches=axs[0].hist((df3_base['mass']),bins = 300, range=range1, facecolor='red',alpha = 0.3) -ns1, bins1, patches1=axs[0].hist((new_check_set['mass']),bins = 300, range=range1,facecolor='blue',alpha = 0.3) - -axs[0].set_ylabel("counts", fontsize = 15) -axs[0].legend(('XGBoost Selected $\Lambda$s','KFPF selected $\Lambda$s'), fontsize = 15, loc='upper right') - -#plt.rcParams["legend.loc"] = 'upper right' -axs[0].set_title("The lambda's Invariant Mass histogram with KFPF and XGB selection criteria on KFPF variables", fontsize = 15) -axs[0].grid() -axs[0].tick_params(axis='both', which='major', labelsize=15) - - -hist1, bin_edges1 = np.histogram(df3_base['mass'],range=(1.09, 1.17), bins=300) -hist2, bin_edges2 = np.histogram(new_check_set['mass'],range=(1.09, 1.17), bins=300) - -#makes sense to have only positive values -diff = (hist1 - hist2) -axs[1].bar(bins[:-1], # this is what makes it comparable - ns / ns1, # maybe check for div-by-zero! - width=0.001) -plt.xlabel("Mass in $\dfrac{GeV}{c^2}$", fontsize = 15) -axs[1].set_ylabel("XGB / KFPF", fontsize = 15) -axs[1].grid() -axs[1].tick_params(axis='both', which='major', labelsize=15) - -plt.show() -fig.tight_layout() diff --git a/The CBM ML-XGB package.ipynb b/The CBM ML-XGB package.ipynb deleted file mode 100644 index 92302d9..0000000 --- a/The CBM ML-XGB package.ipynb +++ /dev/null @@ -1,246 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#We import some libraries\n", - "import numpy as np\n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "import xgboost as xgb\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import confusion_matrix\n", - "from sklearn.model_selection import RandomizedSearchCV, cross_val_score\n", - "from scipy.stats import uniform\n", - "import weakref \n", - "from bayes_opt import BayesianOptimization\n", - "from data_cleaning import clean_df\n", - "from KFPF_lambda_cuts import KFPF_lambda_cuts\n", - "from plot_tools import AMS, preds_prob, plot_confusion_matrix\n", - "import gc\n", - "from tree_importer import tree_importer\n", - "\n", - "#To save some memory we will delete unused variables\n", - "class TestClass(object): \n", - " def check(self): \n", - " print (\"object is alive!\") \n", - " def __del__(self): \n", - " print (\"object deleted\") \n", - " \n", - "#to paralellize some part of the code\n", - "from concurrent.futures import ThreadPoolExecutor\n", - "executor = ThreadPoolExecutor(8)\n", - "\n", - "\n", - "# We import three root files into our jupyter notebook\n", - "signal= tree_importer('/home/shahid/cbmsoft/Data/PFSimplePlainTreeSignal.root','PlainTree')\n", - "\n", - "# We only select lambda candidates\n", - "sgnal = signal[(signal['LambdaCandidates_is_signal']==1) & (signal['LambdaCandidates_mass']>1.108)\n", - " & (signal['LambdaCandidates_mass']<1.1227)]\n", - "\n", - "# Similarly for the background\n", - "background = tree_importer('/home/shahid/cbmsoft/Data/PFSimplePlainTreeBackground.root','PlainTree')\n", - "bg = background[(background['LambdaCandidates_is_signal'] == 0)\n", - 
" & ((background['LambdaCandidates_mass'] > 1.07)\n", - " & (background['LambdaCandidates_mass'] < 1.108) | (background['LambdaCandidates_mass']>1.1227) \n", - " & (background['LambdaCandidates_mass'] < 2.00))]\n", - "\n", - "#delete unused variables\n", - "del signal\n", - "del background\n", - "\n", - "#we also import a 10k events data set, generated using URQMD with AuAu collisions at 12AGeV\n", - "file = tree_importer('/home/shahid/cbmsoft/Data/10k_events_PFSimplePlainTree.root','PlainTree')\n", - "#Call the python garbage collector to clean up things\n", - "gc.collect()\n", - "df_original= pd.DataFrame(data=file)\n", - "del file\n", - "\n", - "#The labels of the columns in the df data frame are having the prefix LambdaCandidates_ so we rename them\n", - "new_labels= ['chi2geo', 'chi2primneg', 'chi2primpos', 'chi2topo', 'cosineneg',\n", - " 'cosinepos', 'cosinetopo', 'distance', 'eta', 'l', 'ldl',\n", - " 'mass', 'p', 'pT', 'phi', 'px', 'py', 'pz', 'rapidity',\n", - " 'x', 'y', 'z', 'daughter1id', 'daughter2id', 'isfrompv', 'pid', 'issignal']\n", - "\n", - "sgnal.columns = new_labels\n", - "bg.columns = new_labels\n", - "df_original.columns=new_labels\n", - "\n", - "# Next we clean the data using the clean_df function saved in another .py file\n", - "\n", - "#Creating a new data frame and saving the results in it after cleaning of the original dfs\n", - "#Also keeping the original one\n", - "bcknd = clean_df(bg)\n", - "signal = clean_df(sgnal)\n", - "\n", - "del bg\n", - "del sgnal\n", - "gc.collect()\n", - "\n", - "df_clean = clean_df(df_original)\n", - "del df_original\n", - "gc.collect()\n", - "\n", - "# We randomly choose our signal set of 4000 candidates\n", - "signal_selected= signal.sample(n=90000)\n", - "\n", - "#background = 3 times the signal is also done randomly\n", - "background_selected = bcknd\n", - "\n", - "del signal\n", - "del bcknd\n", - "\n", - "#Let's combine signal and background\n", - "dfs = [signal_selected, background_selected]\n", - "df_scaled = pd.concat(dfs)\n", - "# Let's shuffle the rows randomly\n", - "df_scaled = df_scaled.sample(frac=1)\n", - "del dfs\n", - "\n", - "\n", - "# The following columns will be used to predict whether a reconstructed candidate is a lambda particle or not\n", - "cuts = [ 'chi2primneg', 'chi2primpos', 'ldl', 'distance', 'chi2geo']\n", - "x = df_scaled[cuts].copy()\n", - "\n", - "# The MC information is saved in this y variable\n", - "y =pd.DataFrame(df_scaled['issignal'], dtype='int')\n", - "\n", - "#We do the same for the 10k events data set\n", - "x_whole = df_clean[cuts].copy()\n", - "y_whole = pd.DataFrame(df_clean['issignal'], dtype='int')\n", - "\n", - "\n", - "#Creating a train and test set from the signal and background combined data set\n", - "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=324)\n", - "\n", - "dtrain = xgb.DMatrix(x_train, label = y_train)\n", - "dtest = xgb.DMatrix(x_whole, label = y_whole)\n", - "dtest1=xgb.DMatrix(x_test, label = y_test)\n", - "\n", - "#Bayesian Optimization function for xgboost\n", - "#specify the parameters you want to tune as keyword arguments\n", - "def bo_tune_xgb(max_depth, gamma, alpha, n_estimators ,learning_rate):\n", - " params = {'max_depth': int(max_depth),\n", - " 'gamma': gamma,\n", - " 'alpha':alpha,\n", - " 'n_estimators': n_estimators,\n", - " 'learning_rate':learning_rate,\n", - " 'subsample': 0.8,\n", - " 'eta': 0.1,\n", - " 'eval_metric': 'auc', 'nthread' : 7}\n", - " cv_result = xgb.cv(params, dtrain, num_boost_round=70, 
nfold=5)\n", - " return cv_result['test-auc-mean'].iloc[-1]\n", - "\n", - "\n", - "#Invoking the Bayesian Optimizer with the specified parameters to tune\n", - "xgb_bo = BayesianOptimization(bo_tune_xgb, {'max_depth': (4, 10),\n", - " 'gamma': (0, 1),\n", - " 'alpha': (2,20),\n", - " 'learning_rate':(0,1),\n", - " 'n_estimators':(100,500)\n", - " })\n", - "\n", - "#performing Bayesian optimization for 5 iterations with 8 steps of random exploration with an #acquisition function of expected improvement\n", - "xgb_bo.maximize(n_iter=5, init_points=8, acq='ei')\n", - "\n", - "max_param = xgb_bo.max['params']\n", - "param= {'alpha': max_param['alpha'], 'gamma': max_param['gamma'], 'learning_rate': max_param['learning_rate'], 'max_depth': int(round(max_param['max_depth'],0)), 'n_estimators': int(round(max_param['n_estimators'],0)), 'objective': 'binary:logistic'}\n", - "\n", - "bst = xgb.train(param, dtrain)\n", - "bst1= bst.predict(dtrain)\n", - "\n", - "bst_test = pd.DataFrame(data=bst.predict(dtest1), columns=[\"xgb_preds\"])\n", - "y_test=y_test.set_index(np.arange(0,bst_test.shape[0]))\n", - "bst_test['issignal']=y_test['issignal']\n", - "\n", - "#To visualize the predictions of the classifier in terms of probabilities\n", - "#The first argument should be a data frame, the second a column in it, in the form 'preds'\n", - "preds_prob(bst_test,'xgb_preds', 'issignal')\n", - "\n", - "#To choose the best threshold\n", - "train_best, test_best = AMS(y_train, bst1,y_test, bst_test['xgb_preds'])\n", - "\n", - "#Applying XGB on the 10k events data-set\n", - "df_clean['xgb_preds'] = bst.predict(dtest)\n", - "\n", - "#plot confusion matrix\n", - "cut1 = test_best\n", - "df_clean['xgb_preds1'] = ((df_clean['xgb_preds']>cut1)*1)\n", - "cnf_matrix = confusion_matrix(y_whole, df_clean['xgb_preds1'], labels=[1,0])\n", - "#cnf_matrix = confusion_matrix(new_check_set['issignal'], new_check_set['new_signal'], labels=[1,0])\n", - "np.set_printoptions(precision=2)\n", - "fig, axs = plt.subplots(figsize=(10, 8))\n", - "axs.yaxis.set_label_coords(-0.04,.5)\n", - "axs.xaxis.set_label_coords(0.5,-.005)\n", - "plot_confusion_matrix(cnf_matrix, classes=['signal','background'], title='Confusion Matrix for XGB for cut > '+str(cut1))\n", - "\n", - "\n", - "#comparison with manual cuts\n", - "#returns a new df \n", - "new_check_set=KFPF_lambda_cuts(df_clean)\n", - "\n", - "\n", - "cut3 = test_best\n", - "mask1 = df_clean['xgb_preds']>cut3\n", - "df3_base=df_clean[mask1]\n", - "from matplotlib import gridspec\n", - "range1= (1.0999, 1.17)\n", - "fig, axs = plt.subplots(2, 1,figsize=(15,10), sharex=True, gridspec_kw={'width_ratios': [10],\n", - " 'height_ratios': [8,4]})\n", - "\n", - "ns, bins, patches=axs[0].hist((df3_base['mass']),bins = 300, range=range1, facecolor='red',alpha = 0.3)\n", - "ns1, bins1, patches1=axs[0].hist((new_check_set['mass']),bins = 300, range=range1,facecolor='blue',alpha = 0.3)\n", - "\n", - "axs[0].set_ylabel(\"counts\", fontsize = 15)\n", - "axs[0].legend(('XGBoost Selected $\\Lambda$s','KFPF selected $\\Lambda$s'), fontsize = 15, loc='upper right')\n", - "\n", - "#plt.rcParams[\"legend.loc\"] = 'upper right'\n", - "axs[0].set_title(\"The lambda's Invariant Mass histogram with KFPF and XGB selection criteria on KFPF variables\", fontsize = 15)\n", - "axs[0].grid()\n", - "axs[0].tick_params(axis='both', which='major', labelsize=15)\n", - "\n", - "\n", - "hist1, bin_edges1 = np.histogram(df3_base['mass'],range=(1.09, 1.17), bins=300)\n", - "hist2, bin_edges2 = 
np.histogram(new_check_set['mass'],range=(1.09, 1.17), bins=300)\n", - "\n", - "#makes sense to have only positive values \n", - "diff = (hist1 - hist2)\n", - "axs[1].bar(bins[:-1], # this is what makes it comparable\n", - " ns / ns1, # maybe check for div-by-zero!\n", - " width=0.001)\n", - "plt.xlabel(\"Mass in $\\dfrac{GeV}{c^2}$\", fontsize = 15)\n", - "axs[1].set_ylabel(\"XGB / KFPF\", fontsize = 15)\n", - "axs[1].grid()\n", - "axs[1].tick_params(axis='both', which='major', labelsize=15)\n", - "\n", - "plt.show()\n", - "fig.tight_layout()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/clusterXGB.py b/clusterXGB.py new file mode 100644 index 0000000..2044305 --- /dev/null +++ b/clusterXGB.py @@ -0,0 +1,192 @@ +import numpy as np +import pandas as pd +import xgboost as xgb +import matplotlib.pyplot as plt + +from library.CBM_ML.tree_importer import tree_importer + +from sklearn.model_selection import train_test_split + +from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score + +from sklearn.tree import plot_tree +from sklearn.tree import export_graphviz + +from sklearn.model_selection import cross_val_score +from scipy.stats import uniform + +from numpy import sqrt, log, argmax + +from bayes_opt import BayesianOptimization + +import gc + + +signal_path = '/home/olha/CBM/dataset10k_tree/dcm_1m_prim_signal.root' +df_urqmd_path = '/home/olha/CBM/dataset10k_tree/urqmd_100k_cleaned.root' + +tree_name = 'PlainTree' + +signal= tree_importer(signal_path,tree_name, 3) +df_urqmd = tree_importer(df_urqmd_path, tree_name,3) + +background_selected = df_urqmd[(df_urqmd['issignal'] == 0) &\ + ((df_urqmd['mass'] > 1.07) &\ + (df_urqmd['mass'] < 1.108) | (df_urqmd['mass']>1.1227) &\ + (df_urqmd['mass'] < 1.3))] + + +#Let's combine signal and background +df_scaled = pd.concat([signal, background_selected]) +# Let's shuffle the rows randomly +df_scaled = df_scaled.sample(frac=1) +# Let's take a look at the top 10 entries of the df +df_scaled.iloc[0:10,:] +del signal, background_selected + + +# The following columns will be used to predict whether a reconstructed candidate is a lambda particle or not +# cuts = [ 'chi2primneg', 'chi2primpos', 'ldl', 'distance', 'chi2geo'] + +cuts = [ 'chi2primneg', 'chi2primpos'] + +x = df_scaled[cuts].copy() + +# The MC information is saved in this y variable +y =pd.DataFrame(df_scaled['issignal'], dtype='int') + +x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=324) + +#DMatrix is a internal data structure that used by XGBoost which is optimized for both memory efficiency and training speed. 
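+# A minimal sketch of the constructor, with toy values purely for illustration
+# (not part of this analysis):
+#     toy = xgb.DMatrix(np.zeros((4, 2)), label=np.array([0, 1, 0, 1]))
+# The two calls below do the same with the selected features and MC truth labels.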
+dtrain = xgb.DMatrix(x_train, label = y_train) +dtest1=xgb.DMatrix(x_test, label = y_test) + + +#Bayesian Optimization function for xgboost +#specify the parameters you want to tune as keyword arguments +def bo_tune_xgb(max_depth, gamma, alpha, n_estimators ,learning_rate): + params = {'max_depth': int(max_depth), + 'gamma': gamma, + 'alpha':alpha, + 'n_estimators': n_estimators, + 'learning_rate':learning_rate, + 'subsample': 0.8, + 'eta': 0.3, + 'eval_metric': 'auc', 'nthread' : 7} + cv_result = xgb.cv(params, dtrain, num_boost_round=10, nfold=5) + return cv_result['test-auc-mean'].iloc[-1] + + +#Invoking the Bayesian Optimizer with the specified parameters to tune +xgb_bo = BayesianOptimization(bo_tune_xgb, {'max_depth': (4, 10), + 'gamma': (0, 1), + 'alpha': (2,20), + 'learning_rate':(0,1), + 'n_estimators':(100,500) + }) + + +#performing Bayesian optimization for 5 iterations with 8 steps of random exploration with an #acquisition function of expected improvement +xgb_bo.maximize(n_iter=5, init_points=5) + + +max_param = xgb_bo.max['params'] +param= {'alpha': max_param['alpha'], 'gamma': max_param['gamma'], 'learning_rate': max_param['learning_rate'], 'max_depth': int(round(max_param['max_depth'],0)), 'n_estimators': int(round(max_param['n_estimators'],0)), 'objective': 'reg:logistic'} +gc.collect() + + +#To train the algorithm using the parameters selected by bayesian optimization +#Fit/train on training data +bst = xgb.train(param, dtrain) + +#predicitions on training set +bst_train= pd.DataFrame(data=bst.predict(dtrain, output_margin=False), columns=["xgb_preds"]) +y_train=y_train.set_index(np.arange(0,bst_train.shape[0])) +bst_train['issignal']=y_train['issignal'] + + +#predictions on test set +bst_test = pd.DataFrame(data=bst.predict(dtest1, output_margin=False), columns=["xgb_preds"]) +y_test=y_test.set_index(np.arange(0,bst_test.shape[0])) +bst_test['issignal']=y_test['issignal'] + +#The following graph will show us that which features are important for the model +ax = xgb.plot_importance(bst) +plt.rcParams['figure.figsize'] = [5, 3] +plt.show() +ax.figure.tight_layout() +#ax.figure.savefig("hits.png") + +#ROC cures for the predictions on train and test sets +train_best, test_best = plot_tools.AMS(y_train, bst_train['xgb_preds'],y_test, bst_test['xgb_preds']) + +#The first argument should be a data frame, the second a column in it, in the form 'preds' +plot_tools.preds_prob(bst_test,'xgb_preds', 'issignal','test') + +#To save some memory on colab we delete some unused variables +del dtrain, dtest1, x_train, x_test, y_train, y_test, df_scaled +gc.collect() + + +x_whole_1 = df_urqmd[cuts].copy() +y_whole_1 = pd.DataFrame(df_urqmd['issignal'], dtype='int') +dtest2 = xgb.DMatrix(x_whole_1, label = y_whole_1) +df_urqmd['xgb_preds'] = bst.predict(dtest2, output_margin=False) + +del x_whole_1, y_whole_1, dtest2 +gc.collect() + +x_whole = df_dcm[cuts].copy() +y_whole = pd.DataFrame(df_dcm['issignal'], dtype='int') +#DMatrix is a internal data structure that used by XGBoost which is optimized for both memory efficiency and training speed. 
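+# Note: with the logistic objective configured above and output_margin=False
+# (the default), bst.predict() returns probabilities in [0, 1];
+# output_margin=True would instead return the raw, untransformed margin scores.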
+dtest = xgb.DMatrix(x_whole, label = y_whole)
+del x_whole, y_whole
+df_dcm['xgb_preds'] = bst.predict(dtest, output_margin=False)
+del dtest
+gc.collect()
+
+
+#lets take the best threshold and look at the confusion matrix
+cut1 = test_best
+df_dcm['xgb_preds1'] = ((df_dcm['xgb_preds']>cut1)*1)
+cnf_matrix = confusion_matrix(df_dcm['issignal'], df_dcm['xgb_preds1'], labels=[1,0])
+np.set_printoptions(precision=2)
+fig, axs = plt.subplots(figsize=(10, 8))
+axs.yaxis.set_label_coords(-0.04,.5)
+axs.xaxis.set_label_coords(0.5,-.005)
+plot_tools.plot_confusion_matrix(cnf_matrix, classes=['signal','background'], title='Confusion Matrix for XGB for cut > '+str(cut1))
+#plt.savefig('confusion_matrix_extreme_gradient_boosting_whole_data.png')
+
+
+plot_tools.cut_visualization(df_urqmd,'xgb_preds',test_best)
+
+xgb.to_graphviz(bst, fmap='', num_trees=0, rankdir=None, yes_color=None, no_color=None, condition_node_params=None, leaf_node_params=None)
+
+new_check_set= df_urqmd.copy()
+new_check_set['new_signal']=0
+mask1 = (new_check_set['chi2primpos'] > 18.4) & (new_check_set['chi2primneg'] > 18.4)
+
+mask2 = (new_check_set['ldl'] > 5) & (new_check_set['distance'] < 1)
+
+mask3 = (new_check_set['chi2geo'] < 3)
+
+new_check_set = new_check_set[(mask1) & (mask2) & (mask3)]
+
+#After all these cuts, what is left is considered as signal, so we replace all the values in the 'new_signal'
+# column by 1
+new_check_set['new_signal'] = 1
+cnf_matrix = confusion_matrix(new_check_set['issignal'], new_check_set['new_signal'], labels=[1,0])
+np.set_printoptions(precision=2)
+fig, axs = plt.subplots(figsize=(10, 8))
+axs.yaxis.set_label_coords(-0.04,.5)
+axs.xaxis.set_label_coords(0.5,-.005)
+plot_tools.plot_confusion_matrix(cnf_matrix, classes=['signal','background'], title='Confusion Matrix for KFPF')
+
+cut3 = test_best
+mask1 = df_original['xgb_preds']>cut3
+df3=df_original[mask1]
+
+plot_tools.comaprison_XGB_KFPF(df3['mass'],new_check_set['mass'])
+
+del x,y,x_test,y_test,x_whole,y_whole,dtest,dtrain,dtest1,df3,df_clean,df_scaled
+gc.collect()
From 493f824c3aab23f9cbf91a0f9fe1a0855ecba708 Mon Sep 17 00:00:00 2001
From: conformist89
Date: Fri, 30 Jul 2021 15:53:20 +0200
Subject: [PATCH 02/40] Parallel thread count no longer hard-coded

---
 clusterXGB.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/clusterXGB.py b/clusterXGB.py
index 2044305..89d62a1 100644
--- a/clusterXGB.py
+++ b/clusterXGB.py
@@ -21,14 +21,18 @@ import gc
 
-
+# Put here your path and your tree
 signal_path = '/home/olha/CBM/dataset10k_tree/dcm_1m_prim_signal.root'
 df_urqmd_path = '/home/olha/CBM/dataset10k_tree/urqmd_100k_cleaned.root'
 
 tree_name = 'PlainTree'
 
-signal= tree_importer(signal_path,tree_name, 3)
-df_urqmd = tree_importer(df_urqmd_path, tree_name,3)
+
+# How many threads we use to parallel code
+number_of_threads = 3
+
+signal= tree_importer(signal_path,tree_name, number_of_threads)
+df_urqmd = tree_importer(df_urqmd_path, tree_name, number_of_threads)
 
 background_selected = df_urqmd[(df_urqmd['issignal'] == 0) &\
 ((df_urqmd['mass'] > 1.07) &\
From dafe099050fbafd27c5511320fe6b8ce0ba452c6 Mon Sep 17 00:00:00 2001
From: conformist89
Date: Sun, 1 Aug 2021 13:11:51 +0200
Subject: [PATCH 03/40] Transformed code to procedural style

---
 clusterXGB.py | 224 +++++++++++++++++++++++++++++++++-----------------
 1 file changed, 147 insertions(+), 77 deletions(-)

diff --git a/clusterXGB.py b/clusterXGB.py
index 89d62a1..9f8fa08 100644
--- a/clusterXGB.py
+++ b/clusterXGB.py
@@ -25,47 +25,96 @@ signal_path = 
'/home/olha/CBM/dataset10k_tree/dcm_1m_prim_signal.root' df_urqmd_path = '/home/olha/CBM/dataset10k_tree/urqmd_100k_cleaned.root' +df_dcm_path = '/home/olha/CBM/dataset10k_tree/dcm_prim_100k_cleaned.root' + tree_name = 'PlainTree' # How many threads we use to parallel code number_of_threads = 3 -signal= tree_importer(signal_path,tree_name, number_of_threads) -df_urqmd = tree_importer(df_urqmd_path, tree_name, number_of_threads) -background_selected = df_urqmd[(df_urqmd['issignal'] == 0) &\ - ((df_urqmd['mass'] > 1.07) &\ - (df_urqmd['mass'] < 1.108) | (df_urqmd['mass']>1.1227) &\ - (df_urqmd['mass'] < 1.3))] +def data_selection(signal_path, bgr_path, tree, threads): + + """ + We have selected signal candidates only from the DCM model, therefore, we are + treating it as simulated data. The URQMD data set will be treated as real + experimental data. + + Our URQMD 100k events data, which looks more like what we will get from the + final experiment, has a lot more background than signal. This problem of + unequal ratio of classes (signal and background) in our data set (URQMD, + 99.99% background and less than 1% signal) is called imbalance classification problem. + + One of the solutions to tackle this problem is resampling the data. + Deleting instances from the over-represented class (in our case the background), + under-sampling, is a resampling method. + + So for training and testing we will get signal candidates from the DCM signal + and background from URQMD (3 times signal size). + Parameters + ---------- + signal_path: str + path to signal file + bgr_path: str + path to background file + tree: str + name of flat tree + threads: int + how many parallel thread we want + """ + signal= tree_importer(signal_path,tree_name, threads) + df_urqmd = tree_importer(bgr_path, tree_name, threads) -#Let's combine signal and background -df_scaled = pd.concat([signal, background_selected]) -# Let's shuffle the rows randomly -df_scaled = df_scaled.sample(frac=1) -# Let's take a look at the top 10 entries of the df -df_scaled.iloc[0:10,:] -del signal, background_selected + background_selected = df_urqmd[(df_urqmd['issignal'] == 0) &\ + ((df_urqmd['mass'] > 1.07) &\ + (df_urqmd['mass'] < 1.108) | (df_urqmd['mass']>1.1227) &\ + (df_urqmd['mass'] < 1.3))] + df_scaled = pd.concat([signal, background_selected]) + df_scaled = df_scaled.sample(frac=1) + df_scaled.iloc[0:10,:] + del signal, background_selected -# The following columns will be used to predict whether a reconstructed candidate is a lambda particle or not -# cuts = [ 'chi2primneg', 'chi2primpos', 'ldl', 'distance', 'chi2geo'] + return df_scaled + +df_scaled = data_selection(signal_path, df_urqmd_path, tree_name, 4) +df_urqmd = tree_importer(df_dcm_path, tree_name, number_of_threads) cuts = [ 'chi2primneg', 'chi2primpos'] -x = df_scaled[cuts].copy() -# The MC information is saved in this y variable -y =pd.DataFrame(df_scaled['issignal'], dtype='int') +def train_test_set(df_scaled, cuts): + """ + To make machine learning algorithms more efficient on unseen data we divide + our data into two sets. One set is for training the algorithm and the other + is for testing the algorithm. If we don't do this then the algorithm can + overfit and we will not capture the general trends in the data. 
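+    Here 20% of the sample is held out for the test set (test_size=0.2 in the
+    train_test_split call below).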
+ + Parameters + ---------- + df_scaled: dataframe_ + dataframe with mixed signal and bacjground + cuts: list(contains strings) + cuts + """ + x = df_scaled[cuts].copy() + + # The MC information is saved in this y variable + y =pd.DataFrame(df_scaled['issignal'], dtype='int') + x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=324) -x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=324) + #DMatrix is a internal data structure that used by XGBoost + # which is optimized for both memory efficiency and training speed. + dtrain = xgb.DMatrix(x_train, label = y_train) + dtest1=xgb.DMatrix(x_test, label = y_test) -#DMatrix is a internal data structure that used by XGBoost which is optimized for both memory efficiency and training speed. -dtrain = xgb.DMatrix(x_train, label = y_train) -dtest1=xgb.DMatrix(x_test, label = y_test) + return dtrain, dtest1 +dtrain, dtest = train_test_set(df_scaled, cuts) + #Bayesian Optimization function for xgboost #specify the parameters you want to tune as keyword arguments def bo_tune_xgb(max_depth, gamma, alpha, n_estimators ,learning_rate): @@ -81,74 +130,95 @@ def bo_tune_xgb(max_depth, gamma, alpha, n_estimators ,learning_rate): return cv_result['test-auc-mean'].iloc[-1] -#Invoking the Bayesian Optimizer with the specified parameters to tune -xgb_bo = BayesianOptimization(bo_tune_xgb, {'max_depth': (4, 10), - 'gamma': (0, 1), - 'alpha': (2,20), - 'learning_rate':(0,1), - 'n_estimators':(100,500) - }) - -#performing Bayesian optimization for 5 iterations with 8 steps of random exploration with an #acquisition function of expected improvement -xgb_bo.maximize(n_iter=5, init_points=5) - - -max_param = xgb_bo.max['params'] -param= {'alpha': max_param['alpha'], 'gamma': max_param['gamma'], 'learning_rate': max_param['learning_rate'], 'max_depth': int(round(max_param['max_depth'],0)), 'n_estimators': int(round(max_param['n_estimators'],0)), 'objective': 'reg:logistic'} -gc.collect() - - -#To train the algorithm using the parameters selected by bayesian optimization -#Fit/train on training data -bst = xgb.train(param, dtrain) +def get_best_params(): + """ + Performs Bayesian Optimization and looks for the best parameters + + Parameters: + None + """ + #Invoking the Bayesian Optimizer with the specified parameters to tune + xgb_bo = BayesianOptimization(bo_tune_xgb, {'max_depth': (4, 10), + 'gamma': (0, 1), + 'alpha': (2,20), + 'learning_rate':(0,1), + 'n_estimators':(100,500) + }) + #performing Bayesian optimization for 5 iterations with 8 steps of random exploration with an #acquisition function of expected improvement + xgb_bo.maximize(n_iter=1, init_points=1) + + max_param = xgb_bo.max['params'] + param= {'alpha': max_param['alpha'], 'gamma': max_param['gamma'], 'learning_rate': max_param['learning_rate'], + 'max_depth': int(round(max_param['max_depth'],0)), 'n_estimators': int(round(max_param['n_estimators'],0)), + 'objective': 'reg:logistic'} + gc.collect() + + + #To train the algorithm using the parameters selected by bayesian optimization + #Fit/train on training data + bst = xgb.train(param, dtrain) + return bst + + +# bst = get_best_params() +# +# #predicitions on training set +# bst_train= pd.DataFrame(data=bst.predict(dtrain, output_margin=False), columns=["xgb_preds"]) +# y_train=y_train.set_index(np.arange(0,bst_train.shape[0])) +# bst_train['issignal']=y_train['issignal'] +# +# +# #predictions on test set +# bst_test = pd.DataFrame(data=bst.predict(dtest1, output_margin=False), 
columns=["xgb_preds"]) +# y_test=y_test.set_index(np.arange(0,bst_test.shape[0])) +# bst_test['issignal']=y_test['issignal'] +# +# +# #The following graph will show us that which features are important for the model +# ax = xgb.plot_importance(bst) +# plt.rcParams['figure.figsize'] = [5, 3] +# plt.show() +# ax.figure.tight_layout() +#ax.figure.savefig("hits.png") -#predicitions on training set -bst_train= pd.DataFrame(data=bst.predict(dtrain, output_margin=False), columns=["xgb_preds"]) -y_train=y_train.set_index(np.arange(0,bst_train.shape[0])) -bst_train['issignal']=y_train['issignal'] -#predictions on test set -bst_test = pd.DataFrame(data=bst.predict(dtest1, output_margin=False), columns=["xgb_preds"]) -y_test=y_test.set_index(np.arange(0,bst_test.shape[0])) -bst_test['issignal']=y_test['issignal'] -#The following graph will show us that which features are important for the model -ax = xgb.plot_importance(bst) -plt.rcParams['figure.figsize'] = [5, 3] -plt.show() -ax.figure.tight_layout() -#ax.figure.savefig("hits.png") +# #ROC cures for the predictions on train and test sets +# train_best, test_best = plot_tools.AMS(y_train, bst_train['xgb_preds'],y_test, bst_test['xgb_preds']) +# +# #The first argument should be a data frame, the second a column in it, in the form 'preds' +# plot_tools.preds_prob(bst_test,'xgb_preds', 'issignal','test') +# +# #To save some memory on colab we delete some unused variables +# del dtrain, dtest1, x_train, x_test, y_train, y_test, df_scaled +# gc.collect() -#ROC cures for the predictions on train and test sets -train_best, test_best = plot_tools.AMS(y_train, bst_train['xgb_preds'],y_test, bst_test['xgb_preds']) -#The first argument should be a data frame, the second a column in it, in the form 'preds' -plot_tools.preds_prob(bst_test,'xgb_preds', 'issignal','test') -#To save some memory on colab we delete some unused variables -del dtrain, dtest1, x_train, x_test, y_train, y_test, df_scaled -gc.collect() +def whole_dataset(df_urqmd): + x_whole_1 = df_urqmd[cuts].copy() + y_whole_1 = pd.DataFrame(df_urqmd['issignal'], dtype='int') + dtest2 = xgb.DMatrix(x_whole_1, label = y_whole_1) + df_urqmd['xgb_preds'] = bst.predict(dtest2, output_margin=False) -x_whole_1 = df_urqmd[cuts].copy() -y_whole_1 = pd.DataFrame(df_urqmd['issignal'], dtype='int') -dtest2 = xgb.DMatrix(x_whole_1, label = y_whole_1) -df_urqmd['xgb_preds'] = bst.predict(dtest2, output_margin=False) + del x_whole_1, y_whole_1, dtest2 + gc.collect() -del x_whole_1, y_whole_1, dtest2 -gc.collect() + x_whole = df_dcm[cuts].copy() + y_whole = pd.DataFrame(df_dcm['issignal'], dtype='int') + #DMatrix is a internal data structure that used by XGBoost which is optimized for both memory efficiency and training speed. + dtest = xgb.DMatrix(x_whole, label = y_whole) + del x_whole, y_whole + df_dcm['xgb_preds'] = bst.predict(dtest, output_margin=False) + del dtest + gc.collect() -x_whole = df_dcm[cuts].copy() -y_whole = pd.DataFrame(df_dcm['issignal'], dtype='int') -#DMatrix is a internal data structure that used by XGBoost which is optimized for both memory efficiency and training speed. 
-dtest = xgb.DMatrix(x_whole, label = y_whole) -del x_whole, y_whole -df_dcm['xgb_preds'] = bst.predict(dtest, output_margin=False) -del dtest -gc.collect() + return df_dcm +whole_dataset(df_urqmd) #lets take the best threshold and look at the confusion matrix cut1 = test_best From 0767fb062faaa1adc0d033f6186de513575a3ecb Mon Sep 17 00:00:00 2001 From: conformist89 Date: Sun, 1 Aug 2021 19:00:54 +0200 Subject: [PATCH 04/40] Tranformed everything till comparison with KFPF to procedure style --- clusterXGB.py | 157 ++++++++++++++++++++++++++------------------------ 1 file changed, 83 insertions(+), 74 deletions(-) diff --git a/clusterXGB.py b/clusterXGB.py index 9f8fa08..5321ae1 100644 --- a/clusterXGB.py +++ b/clusterXGB.py @@ -24,7 +24,6 @@ # Put here your path and your tree signal_path = '/home/olha/CBM/dataset10k_tree/dcm_1m_prim_signal.root' df_urqmd_path = '/home/olha/CBM/dataset10k_tree/urqmd_100k_cleaned.root' - df_dcm_path = '/home/olha/CBM/dataset10k_tree/dcm_prim_100k_cleaned.root' tree_name = 'PlainTree' @@ -80,8 +79,10 @@ def data_selection(signal_path, bgr_path, tree, threads): return df_scaled -df_scaled = data_selection(signal_path, df_urqmd_path, tree_name, 4) -df_urqmd = tree_importer(df_dcm_path, tree_name, number_of_threads) +df_scaled = data_selection(signal_path, df_dcm_path, tree_name, + number_of_threads) +df_dcm = tree_importer(df_dcm, tree_name, number_of_threads) +df_urqmd = tree_importer(df_urqmd_path, tree_name, number_of_threads) cuts = [ 'chi2primneg', 'chi2primpos'] @@ -94,10 +95,10 @@ def train_test_set(df_scaled, cuts): Parameters ---------- - df_scaled: dataframe_ + df_scaled: dataframe dataframe with mixed signal and bacjground cuts: list(contains strings) - cuts + features on which training is based """ x = df_scaled[cuts].copy() @@ -145,7 +146,8 @@ def get_best_params(): 'learning_rate':(0,1), 'n_estimators':(100,500) }) - #performing Bayesian optimization for 5 iterations with 8 steps of random exploration with an #acquisition function of expected improvement + #performing Bayesian optimization for 5 iterations with 8 steps of random exploration + # with an #acquisition function of expected improvement xgb_bo.maximize(n_iter=1, init_points=1) max_param = xgb_bo.max['params'] @@ -161,44 +163,59 @@ def get_best_params(): return bst -# bst = get_best_params() -# -# #predicitions on training set -# bst_train= pd.DataFrame(data=bst.predict(dtrain, output_margin=False), columns=["xgb_preds"]) -# y_train=y_train.set_index(np.arange(0,bst_train.shape[0])) -# bst_train['issignal']=y_train['issignal'] -# -# -# #predictions on test set -# bst_test = pd.DataFrame(data=bst.predict(dtest1, output_margin=False), columns=["xgb_preds"]) -# y_test=y_test.set_index(np.arange(0,bst_test.shape[0])) -# bst_test['issignal']=y_test['issignal'] -# -# -# #The following graph will show us that which features are important for the model -# ax = xgb.plot_importance(bst) -# plt.rcParams['figure.figsize'] = [5, 3] -# plt.show() -# ax.figure.tight_layout() -#ax.figure.savefig("hits.png") +bst = get_best_params() + +#predicitions on training set +bst_train= pd.DataFrame(data=bst.predict(dtrain, output_margin=False), columns=["xgb_preds"]) +y_train=y_train.set_index(np.arange(0,bst_train.shape[0])) +bst_train['issignal']=y_train['issignal'] + + +#predictions on test set +bst_test = pd.DataFrame(data=bst.predict(dtest1, output_margin=False), columns=["xgb_preds"]) +y_test=y_test.set_index(np.arange(0,bst_test.shape[0])) +bst_test['issignal']=y_test['issignal'] +#The following graph will 
show us that which features are important for the model +ax = xgb.plot_importance(bst) +plt.rcParams['figure.figsize'] = [5, 3] +plt.show() +ax.figure.tight_layout() +# ax.figure.savefig("hits.png") -# #ROC cures for the predictions on train and test sets -# train_best, test_best = plot_tools.AMS(y_train, bst_train['xgb_preds'],y_test, bst_test['xgb_preds']) -# -# #The first argument should be a data frame, the second a column in it, in the form 'preds' -# plot_tools.preds_prob(bst_test,'xgb_preds', 'issignal','test') -# -# #To save some memory on colab we delete some unused variables -# del dtrain, dtest1, x_train, x_test, y_train, y_test, df_scaled -# gc.collect() +#ROC cures for the predictions on train and test sets +train_best, test_best = plot_tools.AMS(y_train, bst_train['xgb_preds'],y_test, bst_test['xgb_preds']) + +#The first argument should be a data frame, the second a column in it, in the form 'preds' +plot_tools.preds_prob(bst_test,'xgb_preds', 'issignal','test') + +#To save some memory on colab we delete some unused variables +del dtrain, dtest1, x_train, x_test, y_train, y_test, df_scaled +gc.collect() -def whole_dataset(df_urqmd): + + +def whole_dataset(df_urqmd,df_dcm, cuts): + """ + Makes predictions and adds it to dataset. df_dcm is dataset which wasn't used + by model, so let's have a look what predictions we have for real data + + Parameters + ---------- + df_urqmd: dataframe + 100k events data set + df_dcm: dataframe + 100k events data set(unknown data) + cuts: list(contains strings) + features on which training is based + + """ + x_whole_1 = df_urqmd[cuts].copy() y_whole_1 = pd.DataFrame(df_urqmd['issignal'], dtype='int') dtest2 = xgb.DMatrix(x_whole_1, label = y_whole_1) @@ -216,51 +233,43 @@ def whole_dataset(df_urqmd): del dtest gc.collect() - return df_dcm - -whole_dataset(df_urqmd) - -#lets take the best threshold and look at the confusion matrix -cut1 = test_best -df_dcm['xgb_preds1'] = ((df_dcm['xgb_preds']>cut1)*1) -cnf_matrix = confusion_matrix(df_dcm['issignal'], df_dcm['xgb_preds1'], labels=[1,0]) -np.set_printoptions(precision=2) -fig, axs = plt.subplots(figsize=(10, 8)) -axs.yaxis.set_label_coords(-0.04,.5) -axs.xaxis.set_label_coords(0.5,-.005) -plot_tools.plot_confusion_matrix(cnf_matrix, classes=['signal','background'], title='Confusion Matrix for XGB for cut > '+str(cut1)) -#plt.savefig('confusion_matrix_extreme_gradient_boosting_whole_data.png') + return df_urqmd,df_dcm -plot_tools.cut_visualization(df_urqmd,'xgb_preds',test_best) -xgb.to_graphviz(bst, fmap='', num_trees=0, rankdir=None, yes_color=None, no_color=None, condition_node_params=None, leaf_node_params=None) +whole_dataset(df_urqmd,df_dcm, cuts) -new_check_set= df_urqmd.copy() -new_check_set['new_signal']=0 -mask1 = (new_check_set['chi2primpos'] > 18.4) & (new_check_set['chi2primneg'] > 18.4) -mask2 = (new_check_set['ldl'] > 5) & (new_check_set['distance'] < 1) -mask3 = (new_check_set['chi2geo'] < 3) -new_check_set = new_check_set[(mask1) & (mask2) & (mask3)] - -#After all these cuts, what is left is considered as signal, so we replace all the values in the 'new_signal' -# column by 1 -new_check_set['new_signal'] = 1 -cnf_matrix = confusion_matrix(new_check_set['issignal'], new_check_set['new_signal'], labels=[1,0]) -np.set_printoptions(precision=2) -fig, axs = plt.subplots(figsize=(10, 8)) -axs.yaxis.set_label_coords(-0.04,.5) -axs.xaxis.set_label_coords(0.5,-.005) -plot_tools.plot_confusion_matrix(cnf_matrix, classes=['signal','background'], title='Confusion Matrix for KFPF') +def 
CM_plot(test_best, df_dcm): + """ + Plots confusion matrix. A Confusion Matrix C is such that Cij is equal to + the number of observations known to be in group i and predicted to be in + group j. Thus in binary classification, the count of true positives is C00, + false negatives C01,false positives is C10, and true neagtives is C11. -cut3 = test_best -mask1 = df_original['xgb_preds']>cut3 -df3=df_original[mask1] + Confusion matrix is applied to previously unseen by model data, so we can + estimate model's performance -plot_tools.comaprison_XGB_KFPF(df3['mass'],new_check_set['mass']) + Parameters + ---------- + test_best: -del x,y,x_test,y_test,x_whole,y_whole,dtest,dtrain,dtest1,df3,df_clean,df_scaled -gc.collect() + df_dcm: dataframe + 100k events data set(unknown data) + """ + #lets take the best threshold and look at the confusion matrix + cut1 = test_best + df_dcm['xgb_preds1'] = ((df_dcm['xgb_preds']>cut1)*1) + cnf_matrix = confusion_matrix(df_dcm['issignal'], df_dcm['xgb_preds1'], labels=[1,0]) + np.set_printoptions(precision=2) + fig, axs = plt.subplots(figsize=(10, 8)) + axs.yaxis.set_label_coords(-0.04,.5) + axs.xaxis.set_label_coords(0.5,-.005) + plot_tools.plot_confusion_matrix(cnf_matrix, classes=['signal','background'], + title='Confusion Matrix for XGB for cut > '+str(cut1)) + #plt.savefig('confusion_matrix_extreme_gradient_boosting_whole_data.png') + + +CM_plot(test_best, df_dcm) From f29c2a68a9e81a62e95eb81868040e6c9d0d1652 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Tue, 3 Aug 2021 17:55:49 +0200 Subject: [PATCH 05/40] Corrected df_dcm_path --- clusterXGB.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clusterXGB.py b/clusterXGB.py index 5321ae1..d2fea76 100644 --- a/clusterXGB.py +++ b/clusterXGB.py @@ -81,7 +81,7 @@ def data_selection(signal_path, bgr_path, tree, threads): df_scaled = data_selection(signal_path, df_dcm_path, tree_name, number_of_threads) -df_dcm = tree_importer(df_dcm, tree_name, number_of_threads) +df_dcm = tree_importer(df_dcm_path, tree_name, number_of_threads) df_urqmd = tree_importer(df_urqmd_path, tree_name, number_of_threads) cuts = [ 'chi2primneg', 'chi2primpos'] From d7cebba4247d3ee06b54f5d334f053edc77e5f18 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Thu, 5 Aug 2021 15:06:55 +0200 Subject: [PATCH 06/40] 1. PNG was added to gitignore 2. preliminary cluster code is implemented 3. 
Nan rejection was added to tree_importer's quality cuts --- .gitignore | 3 ++ clusterXGB.py | 88 +++++++++------------------------ library/CBM_ML/tree_importer.py | 34 ++++++------- 3 files changed, 43 insertions(+), 82 deletions(-) diff --git a/.gitignore b/.gitignore index a81c8ee..7491591 100644 --- a/.gitignore +++ b/.gitignore @@ -136,3 +136,6 @@ dmypy.json # Cython debug symbols cython_debug/ + +# PNG images +*.png diff --git a/clusterXGB.py b/clusterXGB.py index d2fea76..a18b2f8 100644 --- a/clusterXGB.py +++ b/clusterXGB.py @@ -4,6 +4,7 @@ import matplotlib.pyplot as plt from library.CBM_ML.tree_importer import tree_importer +from library.CBM_ML.plot_tools import AMS, preds_prob,plot_confusion_matrix from sklearn.model_selection import train_test_split @@ -24,7 +25,6 @@ # Put here your path and your tree signal_path = '/home/olha/CBM/dataset10k_tree/dcm_1m_prim_signal.root' df_urqmd_path = '/home/olha/CBM/dataset10k_tree/urqmd_100k_cleaned.root' -df_dcm_path = '/home/olha/CBM/dataset10k_tree/dcm_prim_100k_cleaned.root' tree_name = 'PlainTree' @@ -79,13 +79,13 @@ def data_selection(signal_path, bgr_path, tree, threads): return df_scaled -df_scaled = data_selection(signal_path, df_dcm_path, tree_name, +df_scaled = data_selection(signal_path, df_urqmd_path, tree_name, number_of_threads) -df_dcm = tree_importer(df_dcm_path, tree_name, number_of_threads) -df_urqmd = tree_importer(df_urqmd_path, tree_name, number_of_threads) + cuts = [ 'chi2primneg', 'chi2primpos'] + def train_test_set(df_scaled, cuts): """ To make machine learning algorithms more efficient on unseen data we divide @@ -111,10 +111,13 @@ def train_test_set(df_scaled, cuts): dtrain = xgb.DMatrix(x_train, label = y_train) dtest1=xgb.DMatrix(x_test, label = y_test) - return dtrain, dtest1 + return dtrain, dtest1,x_train, y_train, y_test -dtrain, dtest = train_test_set(df_scaled, cuts) +dtrain, dtest1,x_train, y_train, y_test = train_test_set(df_scaled, cuts) + +del df_scaled +gc.collect() #Bayesian Optimization function for xgboost #specify the parameters you want to tune as keyword arguments @@ -179,70 +182,26 @@ def get_best_params(): #The following graph will show us that which features are important for the model ax = xgb.plot_importance(bst) -plt.rcParams['figure.figsize'] = [5, 3] +plt.rcParams['figure.figsize'] = [6, 3] plt.show() ax.figure.tight_layout() -# ax.figure.savefig("hits.png") - - - +ax.figure.savefig("hits.png") +# +# +# #ROC cures for the predictions on train and test sets -train_best, test_best = plot_tools.AMS(y_train, bst_train['xgb_preds'],y_test, bst_test['xgb_preds']) +train_best, test_best = AMS(y_train, bst_train['xgb_preds'],y_test, bst_test['xgb_preds']) #The first argument should be a data frame, the second a column in it, in the form 'preds' -plot_tools.preds_prob(bst_test,'xgb_preds', 'issignal','test') - -#To save some memory on colab we delete some unused variables -del dtrain, dtest1, x_train, x_test, y_train, y_test, df_scaled -gc.collect() - - - - -def whole_dataset(df_urqmd,df_dcm, cuts): - """ - Makes predictions and adds it to dataset. 
df_dcm is dataset which wasn't used - by model, so let's have a look what predictions we have for real data - - Parameters - ---------- - df_urqmd: dataframe - 100k events data set - df_dcm: dataframe - 100k events data set(unknown data) - cuts: list(contains strings) - features on which training is based - - """ - - x_whole_1 = df_urqmd[cuts].copy() - y_whole_1 = pd.DataFrame(df_urqmd['issignal'], dtype='int') - dtest2 = xgb.DMatrix(x_whole_1, label = y_whole_1) - df_urqmd['xgb_preds'] = bst.predict(dtest2, output_margin=False) - - del x_whole_1, y_whole_1, dtest2 - gc.collect() - - x_whole = df_dcm[cuts].copy() - y_whole = pd.DataFrame(df_dcm['issignal'], dtype='int') - #DMatrix is a internal data structure that used by XGBoost which is optimized for both memory efficiency and training speed. - dtest = xgb.DMatrix(x_whole, label = y_whole) - del x_whole, y_whole - df_dcm['xgb_preds'] = bst.predict(dtest, output_margin=False) - del dtest - gc.collect() - - return df_urqmd,df_dcm - +preds_prob(bst_test,'xgb_preds', 'issignal','test') -whole_dataset(df_urqmd,df_dcm, cuts) -def CM_plot(test_best, df_dcm): +def CM_plot(test_best, x_train): """ Plots confusion matrix. A Confusion Matrix C is such that Cij is equal to the number of observations known to be in group i and predicted to be in @@ -256,20 +215,19 @@ def CM_plot(test_best, df_dcm): ---------- test_best: - df_dcm: dataframe - 100k events data set(unknown data) + x_train: """ #lets take the best threshold and look at the confusion matrix cut1 = test_best - df_dcm['xgb_preds1'] = ((df_dcm['xgb_preds']>cut1)*1) - cnf_matrix = confusion_matrix(df_dcm['issignal'], df_dcm['xgb_preds1'], labels=[1,0]) + x_train['xgb_preds1'] = ((x_train['xgb_preds']>cut1)*1) + cnf_matrix = confusion_matrix(x_train['issignal'], x_train['xgb_preds1'], labels=[1,0]) np.set_printoptions(precision=2) fig, axs = plt.subplots(figsize=(10, 8)) axs.yaxis.set_label_coords(-0.04,.5) axs.xaxis.set_label_coords(0.5,-.005) - plot_tools.plot_confusion_matrix(cnf_matrix, classes=['signal','background'], + plot_confusion_matrix(cnf_matrix, classes=['signal','background'], title='Confusion Matrix for XGB for cut > '+str(cut1)) - #plt.savefig('confusion_matrix_extreme_gradient_boosting_whole_data.png') + plt.savefig('confusion_matrix_extreme_gradient_boosting_whole_data.png') -CM_plot(test_best, df_dcm) +CM_plot(test_best, bst_train) diff --git a/library/CBM_ML/tree_importer.py b/library/CBM_ML/tree_importer.py index a159ee3..f34bd24 100644 --- a/library/CBM_ML/tree_importer.py +++ b/library/CBM_ML/tree_importer.py @@ -7,13 +7,13 @@ """ The tree_importer function takes in flat analysis tree and returns a pandas data-frame object. It has 3 inputs, the first one is path of the analysis tree, the second one as the tree name and third one the number of CPU cores. The first and second input -should be inserted as strings i.e. path inside a single quotation '' or double quotations "". The third input should be a number. +should be inserted as strings i.e. path inside a single quotation '' or double quotations "". The third input should be a number. 
For example tree_importer("/home/flat_trees/a.tree","PlainTree",4) """ -def tree_importer(path,treename, n): +def tree_importer(path,treename, n): #The number of parallel processors executor = ThreadPoolExecutor(n) - + #To open the root file and convert it to a pandas dataframe file = uproot.open(path+':'+treename, library='pd', decompression_executor=executor, interpretation_executor=executor).arrays(library='np',decompression_executor=executor, @@ -28,19 +28,19 @@ def tree_importer(path,treename, n): def quality_cuts_plus_other_cuts_lambda(): #The following quality selection criteria is applied mass_cut = '('+labels[10]+' > 1.07) &' - + coordinate_cut = '('+labels[20]+'>-50) & ('+labels[20]+'<50) & ('+labels[21]+'>-50) & ('+labels[21]+'<50) & ('+labels[22]+'>-1) & ('+labels[22]+'<80) &' - + chi_2_positive_cut ='('+labels[0]+'>0) & ('+labels[8]+'>0) & ('+labels[2]+'>0) & ('+labels[1]+' > 0) &' - + distance_cut = '('+labels[4]+'>0) & ('+labels[5]+'<80) & ('+labels[3]+'>0) & ('+labels[3]+'<100) &' - + pz_cut = '('+labels[14]+'>0) & ' #Other cuts pseudo_rapidity_cut_based_on_acceptance = '('+labels[19]+'>1) & ('+labels[19]+'<6.5) &' - + angular_cut = '('+labels[6]+'>0.1) & ('+labels[7]+'>0.1) &' - + data_reducing_cut = '('+labels[10]+'< 1.3) & ('+labels[15]+'<20) & ('+labels[0]+' < 1000) & ('+labels[2]+'<1e6) & ('+labels[1]+ '< 3e7) & ('+labels[4]+'<5000) & ('+labels[8]+'< 100000)' cuts= mass_cut+coordinate_cut+chi_2_positive_cut+distance_cut+pz_cut+pseudo_rapidity_cut_based_on_acceptance+angular_cut+data_reducing_cut @@ -66,15 +66,15 @@ def labels_lambda(file): """ def tree_importer_with_cuts(path,treename, n): - - #This part changes the labels of the root tree's branches - + + #This part changes the labels of the root tree's branches + new_labels=['chi2geo', 'chi2primneg','chi2primpos', 'distance', 'ldl','mass', 'pT', 'rapidity','issignal'] - + #The number of parallel processors executor = ThreadPoolExecutor(n) - - #To open the + + #To open the file = uproot.open(path+':'+treename, library='pd', decompression_executor=executor, interpretation_executor=executor) labels = labels_lambda(file) @@ -88,6 +88,6 @@ def tree_importer_with_cuts(path,treename, n): #df['issignal']=((df['issignal']>0)*1) with pd.option_context('mode.use_inf_as_na', True): df = df.dropna() - return df - + df = df.dropna() + return df From 734bbd4a0765466507be57eba7bda396da21afa9 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Thu, 5 Aug 2021 15:53:09 +0200 Subject: [PATCH 07/40] Corrected some comments to code --- clusterXGB.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/clusterXGB.py b/clusterXGB.py index a18b2f8..f0e67eb 100644 --- a/clusterXGB.py +++ b/clusterXGB.py @@ -22,7 +22,7 @@ import gc -# Put here your path and your tree +# Put here your signal and background path and your tree as well signal_path = '/home/olha/CBM/dataset10k_tree/dcm_1m_prim_signal.root' df_urqmd_path = '/home/olha/CBM/dataset10k_tree/urqmd_100k_cleaned.root' @@ -82,6 +82,7 @@ def data_selection(signal_path, bgr_path, tree, threads): df_scaled = data_selection(signal_path, df_urqmd_path, tree_name, number_of_threads) +# features to be trained cuts = [ 'chi2primneg', 'chi2primpos'] @@ -96,7 +97,7 @@ def train_test_set(df_scaled, cuts): Parameters ---------- df_scaled: dataframe - dataframe with mixed signal and bacjground + dataframe with mixed signal and background cuts: list(contains strings) features on which training is based """ @@ -187,9 +188,7 @@ def get_best_params(): 
ax.figure.tight_layout() ax.figure.savefig("hits.png") -# -# -# + #ROC cures for the predictions on train and test sets train_best, test_best = AMS(y_train, bst_train['xgb_preds'],y_test, bst_test['xgb_preds']) @@ -198,9 +197,6 @@ def get_best_params(): - - - def CM_plot(test_best, x_train): """ Plots confusion matrix. A Confusion Matrix C is such that Cij is equal to @@ -213,9 +209,11 @@ def CM_plot(test_best, x_train): Parameters ---------- - test_best: + test_best: numpy.float32 + best threshold - x_train: + x_train: dataframe + we want to get confusion matrix on training datasets """ #lets take the best threshold and look at the confusion matrix cut1 = test_best From 5261092a2f1c430455cebf91a206b98e4f0d9c94 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Mon, 9 Aug 2021 12:43:49 +0200 Subject: [PATCH 08/40] User chooses signal, background and output path by command line arguments. For example python3 clusterXGB.py /signal/path/fileS.root /bachground/path.fileB.root output/path --- clusterXGB.py | 41 +++++++++++---- library/CBM_ML/plot_tools.py | 46 ++++++++--------- library/CBM_ML/tree_importer.py | 89 +++++++++++---------------------- 3 files changed, 83 insertions(+), 93 deletions(-) diff --git a/clusterXGB.py b/clusterXGB.py index f0e67eb..102e9ab 100644 --- a/clusterXGB.py +++ b/clusterXGB.py @@ -3,7 +3,7 @@ import xgboost as xgb import matplotlib.pyplot as plt -from library.CBM_ML.tree_importer import tree_importer +from library.CBM_ML.tree_importer import tree_importer, new_labels, quality_cuts from library.CBM_ML.plot_tools import AMS, preds_prob,plot_confusion_matrix from sklearn.model_selection import train_test_split @@ -22,16 +22,27 @@ import gc -# Put here your signal and background path and your tree as well -signal_path = '/home/olha/CBM/dataset10k_tree/dcm_1m_prim_signal.root' -df_urqmd_path = '/home/olha/CBM/dataset10k_tree/urqmd_100k_cleaned.root' +import sys +import os + tree_name = 'PlainTree' +path_list = [] # How many threads we use to parallel code number_of_threads = 3 +for x in sys.argv[1:]: + path_list.append(x) + +signal_path = path_list[0] +df_urqmd_path = path_list[1] + +output_path = path_list[2] + +if not os.path.exists(output_path): + os.mkdir(output_path) def data_selection(signal_path, bgr_path, tree, threads): @@ -66,12 +77,19 @@ def data_selection(signal_path, bgr_path, tree, threads): signal= tree_importer(signal_path,tree_name, threads) df_urqmd = tree_importer(bgr_path, tree_name, threads) + signal = new_labels(signal) + df_urqmd = new_labels(df_urqmd) + + signal = quality_cuts(signal) + df_urqmd = quality_cuts(df_urqmd) + + signal_selected = signal[signal['issignal']==1] background_selected = df_urqmd[(df_urqmd['issignal'] == 0) &\ ((df_urqmd['mass'] > 1.07) &\ (df_urqmd['mass'] < 1.108) | (df_urqmd['mass']>1.1227) &\ (df_urqmd['mass'] < 1.3))] - df_scaled = pd.concat([signal, background_selected]) + df_scaled = pd.concat([signal_selected, background_selected]) df_scaled = df_scaled.sample(frac=1) df_scaled.iloc[0:10,:] del signal, background_selected @@ -82,6 +100,7 @@ def data_selection(signal_path, bgr_path, tree, threads): df_scaled = data_selection(signal_path, df_urqmd_path, tree_name, number_of_threads) + # features to be trained cuts = [ 'chi2primneg', 'chi2primpos'] @@ -186,18 +205,18 @@ def get_best_params(): plt.rcParams['figure.figsize'] = [6, 3] plt.show() ax.figure.tight_layout() -ax.figure.savefig("hits.png") +ax.figure.savefig(str(output_path)+"/hits.png") #ROC cures for the predictions on train and test sets -train_best, 
test_best = AMS(y_train, bst_train['xgb_preds'],y_test, bst_test['xgb_preds']) +train_best, test_best = AMS(y_train, bst_train['xgb_preds'],y_test, bst_test['xgb_preds'], output_path) #The first argument should be a data frame, the second a column in it, in the form 'preds' -preds_prob(bst_test,'xgb_preds', 'issignal','test') +preds_prob(bst_test,'xgb_preds', 'issignal','test', output_path) -def CM_plot(test_best, x_train): +def CM_plot(test_best, x_train, output_path): """ Plots confusion matrix. A Confusion Matrix C is such that Cij is equal to the number of observations known to be in group i and predicted to be in @@ -225,7 +244,7 @@ def CM_plot(test_best, x_train): axs.xaxis.set_label_coords(0.5,-.005) plot_confusion_matrix(cnf_matrix, classes=['signal','background'], title='Confusion Matrix for XGB for cut > '+str(cut1)) - plt.savefig('confusion_matrix_extreme_gradient_boosting_whole_data.png') + plt.savefig(str(output_path)+'/confusion_matrix_extreme_gradient_boosting_whole_data.png') -CM_plot(test_best, bst_train) +CM_plot(test_best, bst_train, output_path) diff --git a/library/CBM_ML/plot_tools.py b/library/CBM_ML/plot_tools.py index 5996144..fce3eee 100644 --- a/library/CBM_ML/plot_tools.py +++ b/library/CBM_ML/plot_tools.py @@ -13,7 +13,7 @@ To find the best threshold which results more signal to background ratio for lambda candidates we use the parameter S0 called the approximate median significance by the higgs boson ML challenge (http://higgsml.lal.in2p3.fr/documentation,9.) """ -def AMS(y_true, y_predict, y_true1, y_predict1): +def AMS(y_true, y_predict, y_true1, y_predict1, output_path): roc_auc=roc_auc_score(y_true, y_predict) fpr, tpr, thresholds = roc_curve(y_true, y_predict,drop_intermediate=False ,pos_label=1) S0 = sqrt(2 * ((tpr + fpr) * log((1 + tpr/fpr)) - tpr)) @@ -43,7 +43,7 @@ def AMS(y_true, y_predict, y_true1, y_predict1): plt.ylim([0, 1.02]) #axs.axis([-0.01, 1, 0.9, 1]) fig.tight_layout() - fig.savefig('hists.png') + fig.savefig(str(output_path)+'/hists.png') plt.show() return S0_best_threshold, S0_best_threshold1 @@ -51,11 +51,11 @@ def AMS(y_true, y_predict, y_true1, y_predict1): """ To visualize true MC signal in the probability distribution returned by XGB classifier for a train-test data-set, the preds_prob function can be used. -Its input are a data-frame, predictions of the classifier (probabilities) and the target in the data-frame, and shows how the True signal is present +Its input are a data-frame, predictions of the classifier (probabilities) and the target in the data-frame, and shows how the True signal is present inside this probability. 
""" -def preds_prob(df, preds, true, dataset): +def preds_prob(df, preds, true, dataset, output_path): if dataset =='train': label1 = 'XGB Predictions on the training data set' else: @@ -70,15 +70,15 @@ def preds_prob(df, preds, true, dataset): err = np.sqrt(hist) center = (bins[:-1] + bins[1:]) / 2 - + hist1, bins1 = np.histogram(TN[preds], bins=bins1) err1 = np.sqrt(hist1) plt.errorbar(center, hist1, yerr=err1, fmt='o', c='Red', label='Background in predictions') - + plt.errorbar(center, hist, yerr=err, fmt='o', c='blue', label='Signal in predictions') - + ax.set_yscale('log') plt.xlabel('Probability',fontsize=18) plt.ylabel('Counts', fontsize=18) @@ -88,18 +88,18 @@ def preds_prob(df, preds, true, dataset): ax.tick_params(axis='both', which='minor', labelsize=16) plt.show() fig.tight_layout() - fig.savefig('test_best.png') + fig.savefig(str(output_path)+'/test_best.png') + - """ A **Confusion Matrix** $C$ is such that $C_{ij}$ is equal to the number of observations known to be in group $i$ and predicted to be in group $j$. Thus in binary classification, the count of true positives is $C_{00}$, false negatives $C_{01}$,false positives is $C_{10}$, and true neagtives is $C_{11}$. If $ y^{'}_{i} $ is the predicted value of the $ i$-th sample and $y_{i}$ is the corresponding true value, then the fraction of correct predictions over -$ n_{samples}$ is defined as +$ n_{samples}$ is defined as $$ True \: positives (y,y^{'}) = \sum_{i=1}^{n_{samples} } 1 (y^{'}_{i} = y_{i}=1) -$$ +$$ The following function prints and plots the confusion matrix. Normalization can be applied by setting `normalize=True`. """ @@ -159,7 +159,7 @@ def comaprison_XGB_KFPF(XGB_mass,KFPF_mass): hist1, bin_edges1 = np.histogram(XGB_mass,range=(1.09, 1.17), bins=300) hist2, bin_edges2 = np.histogram(KFPF_mass,range=(1.09, 1.17), bins=300) - #makes sense to have only positive values + #makes sense to have only positive values diff = (hist1 - hist2) axs[1].bar(bins[:-1], # this is what makes it comparable ns / ns1, # maybe check for div-by-zero! 
@@ -174,9 +174,9 @@ def comaprison_XGB_KFPF(XGB_mass,KFPF_mass): """ -Function that plots signal and background in the train-test data set +Function that plots signal and background in the train-test data set """ -def plt_sig_back(df): +def plt_sig_back(df, output_path): range1 = (1.077, 1.18) fig, axs = plt.subplots(figsize=(10, 6)) #df_scaled['mass'].plot.hist(bins = 300, range=range1,grid=True,sharey=True) @@ -193,16 +193,16 @@ def plt_sig_back(df): axs.tick_params(axis='both', which='major', labelsize=18) plt.yscale("log") fig.tight_layout() - fig.savefig("hists.png") - + fig.savefig(str(output_path)+"/hists.png") + + - # The following function will display the inavriant mass histogram of the original 10k event set along with the mass histoigram after we apply a cut # on the probability prediction of xgb -def cut_visualization(df, variable,cut, range1=(1.09, 1.19), bins1= 300 ): +def cut_visualization(df, variable,cut, output_path, range1=(1.09, 1.19), bins1= 300): mask1 = df[variable]>cut df3=df[mask1] - + fig, ax2 = plt.subplots(figsize=(12, 8), dpi = 300) color = 'tab:blue' ax2.hist(df['mass'],bins = bins1, range=range1, facecolor='blue' ,alpha = 0.35, label='before selection') @@ -212,9 +212,9 @@ def cut_visualization(df, variable,cut, range1=(1.09, 1.19), bins1= 300 ): ax2.tick_params(axis='both', which='major', labelsize=15) ax2.grid() ax2.set_xlabel("Mass (GeV/${c^2}$)", fontsize = 18) - - - + + + color = 'tab:red' ax1 = ax2.twinx() ax1.hist(df3['mass'], bins = bins1, range=range1, facecolor='red',alpha = 0.35, label="XGB (with a cut > %.2f"%cut+')') @@ -228,4 +228,4 @@ def cut_visualization(df, variable,cut, range1=(1.09, 1.19), bins1= 300 ): #plt.text(0.02, 0.1, r'cut > %.4f'%cut, fontsize=15) plt.show() fig.tight_layout() - fig.savefig("test_best.png") + fig.savefig(str(output_path)+"/test_best.png") diff --git a/library/CBM_ML/tree_importer.py b/library/CBM_ML/tree_importer.py index f34bd24..4635bf5 100644 --- a/library/CBM_ML/tree_importer.py +++ b/library/CBM_ML/tree_importer.py @@ -22,72 +22,43 @@ def tree_importer(path,treename, n): return df -""" -The quality_cuts_plus_other_cuts function applies quality selection criteria with other selection criteria to reduce data size -""" -def quality_cuts_plus_other_cuts_lambda(): - #The following quality selection criteria is applied - mass_cut = '('+labels[10]+' > 1.07) &' - - coordinate_cut = '('+labels[20]+'>-50) & ('+labels[20]+'<50) & ('+labels[21]+'>-50) & ('+labels[21]+'<50) & ('+labels[22]+'>-1) & ('+labels[22]+'<80) &' - - chi_2_positive_cut ='('+labels[0]+'>0) & ('+labels[8]+'>0) & ('+labels[2]+'>0) & ('+labels[1]+' > 0) &' - - distance_cut = '('+labels[4]+'>0) & ('+labels[5]+'<80) & ('+labels[3]+'>0) & ('+labels[3]+'<100) &' - - pz_cut = '('+labels[14]+'>0) & ' - #Other cuts - pseudo_rapidity_cut_based_on_acceptance = '('+labels[19]+'>1) & ('+labels[19]+'<6.5) &' - - angular_cut = '('+labels[6]+'>0.1) & ('+labels[7]+'>0.1) &' +def new_labels(df): + new_labels= ['chi2geo', 'chi2primneg', 'chi2primpos', 'chi2topo', 'cosineneg', + 'cosinepos', 'cosinetopo', 'distance', 'eta', 'l', 'ldl', + 'mass', 'p', 'pT', 'phi', 'px', 'py', 'pz', 'rapidity', 'x', 'y', 'z', + 'daughter1id', 'daughter2id', 'isfrompv', 'pid', 'issignal'] - data_reducing_cut = '('+labels[10]+'< 1.3) & ('+labels[15]+'<20) & ('+labels[0]+' < 1000) & ('+labels[2]+'<1e6) & ('+labels[1]+ '< 3e7) & ('+labels[4]+'<5000) & ('+labels[8]+'< 100000)' - - cuts= 
mass_cut+coordinate_cut+chi_2_positive_cut+distance_cut+pz_cut+pseudo_rapidity_cut_based_on_acceptance+angular_cut+data_reducing_cut - return cuts - -""" -Selects the labels only required for lambda analysis -""" + df.columns = new_labels + return df -def labels_lambda(file): - find_labels = ['chi2_geo','prim_first','prim_second', 'distance','_dl','_l','cosine_first','cosine_second','chi2_topo','cosine_topo','_mass','_pT','_px','_py','_pz','_p','_phi','_rapidity','_pid','_eta','_x','_y','_z','_generation'] - labels = [] - for a in find_labels: - for s in file.keys(): - if a in s: - if 'err' not in s: - if s not in labels: - labels.append(s) - return labels +def quality_cuts(df): -""" -This tree_importer_with_cuts imports tree and also applies quality selection criteria on the data along with some further data reducing cuts. -""" + """ + All the numerical artifacts ( $\chi$2 < 0), inf and nan were deleted. Also applied + quality cuts based on detector geometry. Full description could be found in + https://docs.google.com/document/d/11f0ZKPW8ftTVhTxeWiog1g6qdsGgN1mlIE3vd5FHLbc/edit?usp=sharing + Parameters + ------------------ + df: dataframe + dataframe to be cleaned + """ -def tree_importer_with_cuts(path,treename, n): + with pd.option_context('mode.use_inf_as_na', True): + df = df.dropna() - #This part changes the labels of the root tree's branches + df = df.dropna() - new_labels=['chi2geo', 'chi2primneg','chi2primpos', 'distance', 'ldl','mass', 'pT', 'rapidity','issignal'] + chi2_cut = (df['chi2geo'] > 0) & (df['chi2primpos'] > 0) & (df['chi2primneg'] > 0) &\ + (df['chi2topo'] > 0) + mass_cut = (df['mass'] > 1.077) - #The number of parallel processors - executor = ThreadPoolExecutor(n) + coord_cut = (abs(df['x']) < 50) & (abs(df['y']) < 50) + dist_l_cut = (df['distance'] > 0) & (df['distance'] < 100) &\ + (df['l'] > 0 ) & (df['ldl'] > 0 ) & (abs(df['l']) < 80) - #To open the - file = uproot.open(path+':'+treename, library='pd', decompression_executor=executor, - interpretation_executor=executor) - labels = labels_lambda(file) - cuts = quality_cuts_plus_other_cuts_lambda(labels) - select_labels = [labels[0],labels[1],labels[2],labels[3],labels[4],labels[10],labels[11],labels[18],labels[23]] + pz_cut = (df['pz'] > 0) - np_arrays = file.arrays(select_labels, cuts, library='np',decompression_executor=executor, - interpretation_executor=executor) - df= pd.DataFrame(data=np_arrays) - df.columns = new_labels - #df['issignal']=((df['issignal']>0)*1) - with pd.option_context('mode.use_inf_as_na', True): - df = df.dropna() + cuts = (chi2_cut) & (mass_cut) & (coord_cut) & (dist_l_cut) &\ + (pz_cut) - df = df.dropna() - return df + return df[cuts] From 133d9c1fc084e261b2b33a5e57785305128b985a Mon Sep 17 00:00:00 2001 From: conformist89 Date: Tue, 10 Aug 2021 12:09:54 +0200 Subject: [PATCH 09/40] Plotted distributions of trained features --- .gitignore | 3 ++ clusterXGB.py | 83 ++++++++++++++++++++++++++++++++++---- distributions/var_distr.py | 83 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 161 insertions(+), 8 deletions(-) create mode 100644 distributions/var_distr.py diff --git a/.gitignore b/.gitignore index 7491591..a5f0f04 100644 --- a/.gitignore +++ b/.gitignore @@ -139,3 +139,6 @@ cython_debug/ # PNG images *.png + +# PDF files +*.pdf diff --git a/clusterXGB.py b/clusterXGB.py index 102e9ab..54f845b 100644 --- a/clusterXGB.py +++ b/clusterXGB.py @@ -25,6 +25,9 @@ import sys import os +from distributions.var_distr import hist_variables +from matplotlib.backends.backend_pdf import 
PdfPages + tree_name = 'PlainTree' @@ -131,14 +134,21 @@ def train_test_set(df_scaled, cuts): dtrain = xgb.DMatrix(x_train, label = y_train) dtest1=xgb.DMatrix(x_test, label = y_test) - return dtrain, dtest1,x_train, y_train, y_test + return dtrain, dtest1,x_train, x_test, y_train, y_test -dtrain, dtest1,x_train, y_train, y_test = train_test_set(df_scaled, cuts) +dtrain, dtest1,x_train,x_test, y_train, y_test = train_test_set(df_scaled, cuts) del df_scaled gc.collect() +print("x_train: ", x_train) +print("y_train: ", y_train) + +xy_train = pd.concat([x_train, y_train], axis = 1) + + + #Bayesian Optimization function for xgboost #specify the parameters you want to tune as keyword arguments def bo_tune_xgb(max_depth, gamma, alpha, n_estimators ,learning_rate): @@ -215,8 +225,7 @@ def get_best_params(): preds_prob(bst_test,'xgb_preds', 'issignal','test', output_path) - -def CM_plot(test_best, x_train, output_path): +def CM_plot(best, x, output_path): """ Plots confusion matrix. A Confusion Matrix C is such that Cij is equal to the number of observations known to be in group i and predicted to be in @@ -235,9 +244,9 @@ def CM_plot(test_best, x_train, output_path): we want to get confusion matrix on training datasets """ #lets take the best threshold and look at the confusion matrix - cut1 = test_best - x_train['xgb_preds1'] = ((x_train['xgb_preds']>cut1)*1) - cnf_matrix = confusion_matrix(x_train['issignal'], x_train['xgb_preds1'], labels=[1,0]) + cut1 = best + x['xgb_preds1'] = ((x['xgb_preds']>cut1)*1) + cnf_matrix = confusion_matrix(x['issignal'], x['xgb_preds1'], labels=[1,0]) np.set_printoptions(precision=2) fig, axs = plt.subplots(figsize=(10, 8)) axs.yaxis.set_label_coords(-0.04,.5) @@ -247,4 +256,62 @@ def CM_plot(test_best, x_train, output_path): plt.savefig(str(output_path)+'/confusion_matrix_extreme_gradient_boosting_whole_data.png') -CM_plot(test_best, bst_train, output_path) +CM_plot(train_best, bst_train, output_path) + +x_train['issignalXGB'] = bst_train['xgb_preds'].values +x_train['xgb_preds1'] = ((x_train['issignalXGB']>train_best)*1) + +x_train['issignal'] = y_train.values + +dfs_orig = x_train[x_train['issignal']==1] +dfb_orig = x_train[x_train['issignal']==0] + +dfs_cut = x_train[x_train['xgb_preds1']==1] +dfb_cut = x_train[x_train['xgb_preds1']==0] + + +print('Train signals: ', len(xy_train[xy_train['issignal']==1])) +print('Train background: ', len(xy_train[xy_train['issignal']==0])) + +print("True signal length: ", len(dfs_orig)) +print("True background length: ", len(dfb_orig)) + + +print("Predicted signal length: ", len(dfs_cut)) +print("Predicted background length: ", len(dfb_cut)) + + + +non_log_x = ['cosineneg', 'cosinepos', 'cosinetopo', 'mass', 'pT', 'rapidity', + 'phi', 'eta', 'x', 'y','z', 'px', 'py', 'pz', 'l', 'ldl'] + +log_x = ['chi2geo', 'chi2primneg', 'chi2primpos', 'chi2topo', 'distance'] + +new_log_x = [] + +for cut in cuts: + if cut in log_x: + dfs_orig[cut+'_log'] = np.log(dfs_orig[cut]) + dfb_orig[cut+'_log'] = np.log(dfb_orig[cut]) + + dfs_cut[cut+'_log'] = np.log(dfs_cut[cut]) + dfb_cut[cut+'_log'] = np.log(dfb_cut[cut]) + + new_log_x.append(cut+'_log') + + + dfs_orig = dfs_orig.drop([cut], axis=1) + dfb_orig = dfb_orig.drop([cut], axis=1) + + dfs_cut = dfs_cut.drop([cut], axis=1) + dfb_cut = dfb_cut.drop([cut], axis=1) + + if cut in non_log_x: + new_log_x.append(cut) + + +pdf_cuts = PdfPages(output_path+'/'+'dist_cuts.pdf') +for feat in new_log_x: + hist_variables(dfs_orig, dfb_orig, dfs_cut, dfb_cut, feat, pdf_cuts) + +pdf_cuts.close() diff --git 
a/distributions/var_distr.py b/distributions/var_distr.py new file mode 100644 index 0000000..bb08603 --- /dev/null +++ b/distributions/var_distr.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python +# coding: utf-8 + + +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +import gc +from matplotlib.backends.backend_pdf import PdfPages + +import matplotlib as mpl + + +mpl.rc('figure', max_open_warning = 0) + +def hist_variables(dfs_orig, dfb_orig, dfs_cut, dfb_cut,feature, pdf_key): + """ + Applied quality cuts and created distributions for all the features in pdf + file + Parameters + ---------- + df_s: dataframe + signal + df_b: dataframe + background + feature: str + name of the feature to be plotted + pdf_key: PdfPages object + name of pdf document with distributions + """ + + # fig, ax = plt.subplots(figsize=(20, 10)) + + fig, ax = plt.subplots(2, figsize=(20, 10)) + + ax[0].hist(dfs_orig[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'green') + ax[0].hist(dfb_orig[feature], label = 'background', bins = 500, alpha = 0.2, color = 'magenta') + ax[0].legend(shadow=True,title = 'B/S='+ str(round(len(dfb_orig)/len(dfs_orig), 3)) + '\n inf, nan was deleted \n $\chi^2$>0 '+ + '\n mass > 1.077 Gev/c , pz >0'+ + '\n z > 0, z<80, l > 0, l < 80, ldl > 0, |x|,|y|<50'+ + '\n cosinepos, cosineneg > 0' + + '\n distance > 0, distance <100' + '\n S samples: '+str(dfs_orig.shape[0]) + '\n B samples: '+ str(dfb_orig.shape[0]) + , title_fontsize=20, fontsize =20) + + + ax[0].xaxis.set_tick_params(labelsize=25) + ax[0].yaxis.set_tick_params(labelsize=25) + + ax[0].set_title(str(feature) + ' MC ', fontsize = 25) + ax[0].set_xlabel(feature, fontsize = 25) + + + ax[0].set_yscale('log') + + fig.tight_layout() + + + ax[1].hist(dfs_cut[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'green') + ax[1].hist(dfb_cut[feature], label = 'background', bins = 500, alpha = 0.2, color = 'magenta') + ax[1].legend(shadow=True,title = 'B/S='+ str(round(len(dfb_cut)/len(dfs_cut), 3)) + '\n inf, nan was deleted \n $\chi^2$>0 '+ + '\n mass > 1.077 Gev/c , pz >0'+ + '\n z > 0, z<80, l > 0, l < 80, ldl > 0, |x|,|y|<50'+ + '\n cosinepos, cosineneg > 0' + + '\n distance > 0, distance <100' + '\n S samples: '+str(dfs_cut.shape[0]) + '\n B samples: '+ str(dfb_cut.shape[0]) + + '\n ML cut' + , title_fontsize=20, fontsize =20) + + + ax[1].xaxis.set_tick_params(labelsize=25) + ax[1].yaxis.set_tick_params(labelsize=25) + + ax[1].set_title(feature + ' MC ', fontsize = 25) + ax[1].set_xlabel(feature, fontsize = 25) + + + ax[1].set_yscale('log') + + fig.tight_layout() + + plt.savefig(pdf_key,format='pdf') From bd19211dba0accb391851bdca3d3723f9126604d Mon Sep 17 00:00:00 2001 From: conformist89 Date: Tue, 10 Aug 2021 13:53:27 +0200 Subject: [PATCH 10/40] Implemented distribution with all the variables --- clusterXGB.py | 54 ++++++++++++++++++++++++++++----------------------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/clusterXGB.py b/clusterXGB.py index 54f845b..e527cfe 100644 --- a/clusterXGB.py +++ b/clusterXGB.py @@ -123,29 +123,29 @@ def train_test_set(df_scaled, cuts): cuts: list(contains strings) features on which training is based """ - x = df_scaled[cuts].copy() + # x = df_scaled[cuts].copy() + x = df_scaled.copy() # The MC information is saved in this y variable y =pd.DataFrame(df_scaled['issignal'], dtype='int') - x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=324) + x_train_all, x_test_all, y_train, y_test = train_test_split(x, y, 
test_size=0.2, random_state=324) + + x_train = x_train_all[cuts].copy() + x_test = x_test_all[cuts].copy() #DMatrix is a internal data structure that used by XGBoost # which is optimized for both memory efficiency and training speed. dtrain = xgb.DMatrix(x_train, label = y_train) dtest1=xgb.DMatrix(x_test, label = y_test) - return dtrain, dtest1,x_train, x_test, y_train, y_test + return dtrain, dtest1,x_train_all,x_train, x_test, y_train, y_test -dtrain, dtest1,x_train,x_test, y_train, y_test = train_test_set(df_scaled, cuts) +dtrain, dtest1,x_train_all, x_train,x_test, y_train, y_test = train_test_set(df_scaled, cuts) del df_scaled gc.collect() -print("x_train: ", x_train) -print("y_train: ", y_train) - -xy_train = pd.concat([x_train, y_train], axis = 1) @@ -258,27 +258,29 @@ def CM_plot(best, x, output_path): CM_plot(train_best, bst_train, output_path) -x_train['issignalXGB'] = bst_train['xgb_preds'].values -x_train['xgb_preds1'] = ((x_train['issignalXGB']>train_best)*1) - -x_train['issignal'] = y_train.values +# x_train['issignalXGB'] = bst_train['xgb_preds'].values +# x_train['xgb_preds1'] = ((x_train['issignalXGB']>train_best)*1) +# +# x_train['issignal'] = y_train.values +# +# dfs_orig = x_train[x_train['issignal']==1] +# dfb_orig = x_train[x_train['issignal']==0] +# +# dfs_cut = x_train[x_train['xgb_preds1']==1] +# dfb_cut = x_train[x_train['xgb_preds1']==0] -dfs_orig = x_train[x_train['issignal']==1] -dfb_orig = x_train[x_train['issignal']==0] -dfs_cut = x_train[x_train['xgb_preds1']==1] -dfb_cut = x_train[x_train['xgb_preds1']==0] +x_train_all['issignalXGB'] = bst_train['xgb_preds'].values +x_train_all['xgb_preds1'] = ((x_train_all['issignalXGB']>train_best)*1) -print('Train signals: ', len(xy_train[xy_train['issignal']==1])) -print('Train background: ', len(xy_train[xy_train['issignal']==0])) +x_train_all['issignal'] = y_train.values -print("True signal length: ", len(dfs_orig)) -print("True background length: ", len(dfb_orig)) +dfs_orig = x_train_all[x_train_all['issignal']==1] +dfb_orig = x_train_all[x_train_all['issignal']==0] - -print("Predicted signal length: ", len(dfs_cut)) -print("Predicted background length: ", len(dfb_cut)) +dfs_cut = x_train_all[x_train_all['xgb_preds1']==1] +dfb_cut = x_train_all[x_train_all['xgb_preds1']==0] @@ -289,7 +291,11 @@ def CM_plot(best, x, output_path): new_log_x = [] -for cut in cuts: +cuts1 = ['chi2geo', 'chi2primneg', 'chi2primpos', 'chi2topo', 'cosineneg', + 'cosinepos', 'cosinetopo', 'distance', 'eta', 'l', 'ldl', + 'mass', 'p', 'pT', 'phi', 'px', 'py', 'pz', 'rapidity', 'x', 'y', 'z'] + +for cut in cuts1: if cut in log_x: dfs_orig[cut+'_log'] = np.log(dfs_orig[cut]) dfb_orig[cut+'_log'] = np.log(dfb_orig[cut]) From c0f70a98fd8904681880cc80332b6ab7429326b8 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Wed, 11 Aug 2021 09:33:19 +0200 Subject: [PATCH 11/40] Corrected definition of B/S after ML cut --- clusterXGB.py | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/clusterXGB.py b/clusterXGB.py index e527cfe..a59aaf9 100644 --- a/clusterXGB.py +++ b/clusterXGB.py @@ -258,18 +258,6 @@ def CM_plot(best, x, output_path): CM_plot(train_best, bst_train, output_path) -# x_train['issignalXGB'] = bst_train['xgb_preds'].values -# x_train['xgb_preds1'] = ((x_train['issignalXGB']>train_best)*1) -# -# x_train['issignal'] = y_train.values -# -# dfs_orig = x_train[x_train['issignal']==1] -# dfb_orig = x_train[x_train['issignal']==0] -# -# dfs_cut = x_train[x_train['xgb_preds1']==1] -# dfb_cut = 
x_train[x_train['xgb_preds1']==0]
-
-
 x_train_all['issignalXGB'] = bst_train['xgb_preds'].values
 x_train_all['xgb_preds1'] = ((x_train_all['issignalXGB']>train_best)*1)
@@ -279,9 +267,11 @@ def CM_plot(best, x, output_path):
 dfs_orig = x_train_all[x_train_all['issignal']==1]
 dfb_orig = x_train_all[x_train_all['issignal']==0]

-dfs_cut = x_train_all[x_train_all['xgb_preds1']==1]
-dfb_cut = x_train_all[x_train_all['xgb_preds1']==0]
+dfs_cut = x_train_all[(x_train_all['xgb_preds1']==1) & (x_train_all['issignal']==1)]
+dfb_cut = x_train_all[(x_train_all['xgb_preds1']==1) & (x_train_all['issignal']==0)]

 non_log_x = ['cosineneg', 'cosinepos', 'cosinetopo', 'mass', 'pT', 'rapidity',
From 2319ad1a43335dec317c341c4ed595d32dea721 Mon Sep 17 00:00:00 2001
From: conformist89
Date: Wed, 11 Aug 2021 09:37:04 +0200
Subject: [PATCH 12/40] Updated Readme.md

---
 README.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 067fbe7..f75b0c7 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# XGB
+# XGB 
 This repository contains the basic code of the machine learning package XGBoost applied to lambda reconstruction at the CBM experiment at GSI/FAIR.
 The code is split into different files so that it is more readable. The library directory contains various files that could be installed locally. For further information visit that directory.

 We have created a Google Colab notebook for this code; it has detailed information on what has been done in this code and the motivation behind it. The Colab is available at the following link
 https://colab.research.google.com/drive/1Up8YvcWiYv0NN7nOA5YhlpsiqTsY0eCt?usp=sharing
+# Usage
+
+python /path/to_signal.root /path/to_background.root output_path
From 54fd1139b7c4f461ea5d4f60b513891b868de592 Mon Sep 17 00:00:00 2001
From: conformist89
Date: Wed, 11 Aug 2021 09:38:12 +0200
Subject: [PATCH 13/40] Corrected Readme.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index f75b0c7..52908cb 100644
--- a/README.md
+++ b/README.md
@@ -8,4 +8,4 @@ https://colab.research.google.com/drive/1Up8YvcWiYv0NN7nOA5YhlpsiqTsY0eCt?usp=sh
 # Usage

-python /path/to_signal.root /path/to_background.root output_path
+python clusterXGB.py /path/to_signal.root /path/to_background.root output_path
From 411ed725f70ed4022c49db3264f716b8398ae738 Mon Sep 17 00:00:00 2001
From: conformist89
Date: Wed, 11 Aug 2021 11:25:28 +0200
Subject: [PATCH 14/40] Changed legend

---
 distributions/var_distr.py | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/distributions/var_distr.py b/distributions/var_distr.py
index bb08603..08d4956 100644
--- a/distributions/var_distr.py
+++ b/distributions/var_distr.py
@@ -8,6 +8,7 @@
 import gc
 from matplotlib.backends.backend_pdf import PdfPages
+from matplotlib.font_manager import FontProperties

 import matplotlib as mpl

@@ -34,15 +35,20 @@ def hist_variables(dfs_orig, dfb_orig, dfs_cut, dfb_cut,feature, pdf_key):

     fig, ax = plt.subplots(2, figsize=(20, 10))

+
+    fontP = FontProperties()
+    fontP.set_size('xx-large')
+
-    ax[0].hist(dfs_orig[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'green')
-    ax[0].hist(dfb_orig[feature], label = 'background', bins = 500, alpha = 0.2, color = 'magenta')
-    ax[0].legend(shadow=True,title = 'B/S='+ str(round(len(dfb_orig)/len(dfs_orig), 3)) +
'\n inf, nan was deleted \n $\chi^2$>0 '+ + + fontP = FontProperties() + fontP.set_size('xx-large') + + ax[0].hist(dfs_orig[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'blue') + ax[0].hist(dfb_orig[feature], label = 'background', bins = 500, alpha = 0.4, color = 'red') + ax[0].legend(shadow=True,title = 'S/B='+ str(round(len(dfs_orig)/len(dfb_orig), 3)) + '\n inf, nan was deleted \n $\chi^2$>0 '+ '\n mass > 1.077 Gev/c , pz >0'+ '\n z > 0, z<80, l > 0, l < 80, ldl > 0, |x|,|y|<50'+ '\n cosinepos, cosineneg > 0' + '\n distance > 0, distance <100' '\n S samples: '+str(dfs_orig.shape[0]) + '\n B samples: '+ str(dfb_orig.shape[0]) - , title_fontsize=20, fontsize =20) + , title_fontsize=20, fontsize =20, bbox_to_anchor=(1.05, 1), + loc='upper left', prop=fontP,) ax[0].xaxis.set_tick_params(labelsize=25) @@ -57,16 +63,17 @@ def hist_variables(dfs_orig, dfb_orig, dfs_cut, dfb_cut,feature, pdf_key): fig.tight_layout() - ax[1].hist(dfs_cut[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'green') - ax[1].hist(dfb_cut[feature], label = 'background', bins = 500, alpha = 0.2, color = 'magenta') - ax[1].legend(shadow=True,title = 'B/S='+ str(round(len(dfb_cut)/len(dfs_cut), 3)) + '\n inf, nan was deleted \n $\chi^2$>0 '+ + ax[1].hist(dfs_cut[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'blue') + ax[1].hist(dfb_cut[feature], label = 'background', bins = 500, alpha = 0.4, color = 'red') + ax[1].legend(shadow=True,title = 'S/B='+ str(round(len(dfs_cut)/len(dfb_cut), 3)) + '\n inf, nan was deleted \n $\chi^2$>0 '+ '\n mass > 1.077 Gev/c , pz >0'+ '\n z > 0, z<80, l > 0, l < 80, ldl > 0, |x|,|y|<50'+ '\n cosinepos, cosineneg > 0' + '\n distance > 0, distance <100' '\n S samples: '+str(dfs_cut.shape[0]) + '\n B samples: '+ str(dfb_cut.shape[0]) + '\n ML cut' - , title_fontsize=20, fontsize =20) + , title_fontsize=20, fontsize =20, bbox_to_anchor=(1.05, 1), + loc='upper left', prop=fontP,) ax[1].xaxis.set_tick_params(labelsize=25) From 5ecd084cb038a29639ba99fe644c25d867015c1b Mon Sep 17 00:00:00 2001 From: conformist89 Date: Wed, 11 Aug 2021 12:53:00 +0200 Subject: [PATCH 15/40] Added the same range before and after ML cut --- distributions/var_distr.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/distributions/var_distr.py b/distributions/var_distr.py index 08d4956..d6980e0 100644 --- a/distributions/var_distr.py +++ b/distributions/var_distr.py @@ -50,7 +50,7 @@ def hist_variables(dfs_orig, dfb_orig, dfs_cut, dfb_cut,feature, pdf_key): , title_fontsize=20, fontsize =20, bbox_to_anchor=(1.05, 1), loc='upper left', prop=fontP,) - + ax[0].set_xlim(dfb_orig[feature].min(), dfb_orig[feature].max()) ax[0].xaxis.set_tick_params(labelsize=25) ax[0].yaxis.set_tick_params(labelsize=25) @@ -76,6 +76,7 @@ def hist_variables(dfs_orig, dfb_orig, dfs_cut, dfb_cut,feature, pdf_key): loc='upper left', prop=fontP,) + ax[1].set_xlim(dfb_orig[feature].min(), dfb_orig[feature].max()) ax[1].xaxis.set_tick_params(labelsize=25) ax[1].yaxis.set_tick_params(labelsize=25) From 47a597bdeff8c8195fba1ea22f31febcf610c46f Mon Sep 17 00:00:00 2001 From: conformist89 Date: Wed, 11 Aug 2021 16:03:15 +0200 Subject: [PATCH 16/40] Added difference between signal and background --- clusterXGB.py | 14 ++++++++--- distributions/var_distr.py | 48 +++++++++++++++++++++++++++----------- 2 files changed, 46 insertions(+), 16 deletions(-) diff --git a/clusterXGB.py b/clusterXGB.py index a59aaf9..0c36001 100644 --- a/clusterXGB.py +++ b/clusterXGB.py @@ -267,12 +267,17 @@ def 
CM_plot(best, x, output_path): dfs_orig = x_train_all[x_train_all['issignal']==1] dfb_orig = x_train_all[x_train_all['issignal']==0] -# dfs_cut = x_train_all[x_train_all['xgb_preds1']==1] -# dfb_cut = x_train_all[x_train_all['xgb_preds1']==0] dfs_cut = x_train_all[(x_train_all['xgb_preds1']==1) & (x_train_all['issignal']==1)] dfb_cut = x_train_all[(x_train_all['xgb_preds1']==1) & (x_train_all['issignal']==0)] +difference_s = pd.concat([dfs_orig, dfs_cut]).drop_duplicates(keep=False) + +print("x_train_all: ", len(x_train_all)) +print("dfs_orig: ", len(dfs_orig)) + +print("dfs_cut: ", len(dfs_cut)) +print("difference: ", len(difference_s)) non_log_x = ['cosineneg', 'cosinepos', 'cosinetopo', 'mass', 'pT', 'rapidity', 'phi', 'eta', 'x', 'y','z', 'px', 'py', 'pz', 'l', 'ldl'] @@ -293,6 +298,8 @@ def CM_plot(best, x, output_path): dfs_cut[cut+'_log'] = np.log(dfs_cut[cut]) dfb_cut[cut+'_log'] = np.log(dfb_cut[cut]) + difference_s[cut+'_log'] = np.log(difference_s[cut]) + new_log_x.append(cut+'_log') @@ -301,6 +308,7 @@ def CM_plot(best, x, output_path): dfs_cut = dfs_cut.drop([cut], axis=1) dfb_cut = dfb_cut.drop([cut], axis=1) + difference_s = difference_s.drop([cut], axis=1) if cut in non_log_x: new_log_x.append(cut) @@ -308,6 +316,6 @@ def CM_plot(best, x, output_path): pdf_cuts = PdfPages(output_path+'/'+'dist_cuts.pdf') for feat in new_log_x: - hist_variables(dfs_orig, dfb_orig, dfs_cut, dfb_cut, feat, pdf_cuts) + hist_variables(dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s, feat, pdf_cuts) pdf_cuts.close() diff --git a/distributions/var_distr.py b/distributions/var_distr.py index d6980e0..43c684d 100644 --- a/distributions/var_distr.py +++ b/distributions/var_distr.py @@ -15,7 +15,7 @@ mpl.rc('figure', max_open_warning = 0) -def hist_variables(dfs_orig, dfb_orig, dfs_cut, dfb_cut,feature, pdf_key): +def hist_variables(dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s,feature, pdf_key): """ Applied quality cuts and created distributions for all the features in pdf file @@ -33,7 +33,7 @@ def hist_variables(dfs_orig, dfb_orig, dfs_cut, dfb_cut,feature, pdf_key): # fig, ax = plt.subplots(figsize=(20, 10)) - fig, ax = plt.subplots(2, figsize=(20, 10)) + fig, ax = plt.subplots(3, figsize=(20, 10)) fontP = FontProperties() @@ -47,12 +47,13 @@ def hist_variables(dfs_orig, dfb_orig, dfs_cut, dfb_cut,feature, pdf_key): '\n cosinepos, cosineneg > 0' + '\n distance > 0, distance <100' '\n S samples: '+str(dfs_orig.shape[0]) + '\n B samples: '+ str(dfb_orig.shape[0]) - , title_fontsize=20, fontsize =20, bbox_to_anchor=(1.05, 1), + , title_fontsize=15, fontsize =15, bbox_to_anchor=(1.05, 1), loc='upper left', prop=fontP,) ax[0].set_xlim(dfb_orig[feature].min(), dfb_orig[feature].max()) - ax[0].xaxis.set_tick_params(labelsize=25) - ax[0].yaxis.set_tick_params(labelsize=25) + + ax[0].xaxis.set_tick_params(labelsize=15) + ax[0].yaxis.set_tick_params(labelsize=15) ax[0].set_title(str(feature) + ' MC ', fontsize = 25) ax[0].set_xlabel(feature, fontsize = 25) @@ -65,20 +66,17 @@ def hist_variables(dfs_orig, dfb_orig, dfs_cut, dfb_cut,feature, pdf_key): ax[1].hist(dfs_cut[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'blue') ax[1].hist(dfb_cut[feature], label = 'background', bins = 500, alpha = 0.4, color = 'red') - ax[1].legend(shadow=True,title = 'S/B='+ str(round(len(dfs_cut)/len(dfb_cut), 3)) + '\n inf, nan was deleted \n $\chi^2$>0 '+ - '\n mass > 1.077 Gev/c , pz >0'+ - '\n z > 0, z<80, l > 0, l < 80, ldl > 0, |x|,|y|<50'+ - '\n cosinepos, cosineneg > 0' + - '\n distance > 0, 
distance <100' + ax[1].legend(shadow=True,title = 'S/B='+ str(round(len(dfs_cut)/len(dfb_cut), 3)) + 'quality cuts+' '\n S samples: '+str(dfs_cut.shape[0]) + '\n B samples: '+ str(dfb_cut.shape[0]) + '\n ML cut' - , title_fontsize=20, fontsize =20, bbox_to_anchor=(1.05, 1), + , title_fontsize=15, fontsize =15, bbox_to_anchor=(1.05, 1), loc='upper left', prop=fontP,) ax[1].set_xlim(dfb_orig[feature].min(), dfb_orig[feature].max()) - ax[1].xaxis.set_tick_params(labelsize=25) - ax[1].yaxis.set_tick_params(labelsize=25) + + ax[1].xaxis.set_tick_params(labelsize=15) + ax[1].yaxis.set_tick_params(labelsize=15) ax[1].set_title(feature + ' MC ', fontsize = 25) ax[1].set_xlabel(feature, fontsize = 25) @@ -88,4 +86,28 @@ def hist_variables(dfs_orig, dfb_orig, dfs_cut, dfb_cut,feature, pdf_key): fig.tight_layout() + + + + ax[2].hist(difference_s[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'blue') + ax[2].legend(shadow=True,title = 'signal difference', + title_fontsize=15, fontsize =15, bbox_to_anchor=(1.05, 1), + loc='upper left', prop=fontP,) + + + ax[2].set_xlim(dfb_orig[feature].min(), dfb_orig[feature].max()) + + ax[2].xaxis.set_tick_params(labelsize=15) + ax[2].yaxis.set_tick_params(labelsize=15) + + ax[2].set_title(feature + ' MC ', fontsize = 25) + ax[2].set_xlabel(feature, fontsize = 25) + + + ax[2].set_yscale('log') + + fig.tight_layout() + + + plt.savefig(pdf_key,format='pdf') From f114f9d779d89c289d1fe7ecc3e9eda915d3134f Mon Sep 17 00:00:00 2001 From: conformist89 Date: Wed, 11 Aug 2021 16:27:57 +0200 Subject: [PATCH 17/40] Checked distributions for train dataset --- clusterXGB.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/clusterXGB.py b/clusterXGB.py index 0c36001..34f403e 100644 --- a/clusterXGB.py +++ b/clusterXGB.py @@ -138,10 +138,10 @@ def train_test_set(df_scaled, cuts): dtrain = xgb.DMatrix(x_train, label = y_train) dtest1=xgb.DMatrix(x_test, label = y_test) - return dtrain, dtest1,x_train_all,x_train, x_test, y_train, y_test + return dtrain, dtest1,x_train_all,x_test_all,x_train, x_test, y_train, y_test -dtrain, dtest1,x_train_all, x_train,x_test, y_train, y_test = train_test_set(df_scaled, cuts) +dtrain, dtest1,x_train_all, x_test_all, x_train,x_test, y_train, y_test = train_test_set(df_scaled, cuts) del df_scaled gc.collect() @@ -256,24 +256,26 @@ def CM_plot(best, x, output_path): plt.savefig(str(output_path)+'/confusion_matrix_extreme_gradient_boosting_whole_data.png') -CM_plot(train_best, bst_train, output_path) +CM_plot(test_best, bst_test, output_path) +print("x_train_all: ", len(x_train_all)) +print("x_test_all: ", len(x_test_all)) -x_train_all['issignalXGB'] = bst_train['xgb_preds'].values -x_train_all['xgb_preds1'] = ((x_train_all['issignalXGB']>train_best)*1) +x_test_all['issignalXGB'] = bst_test['xgb_preds'].values +x_test_all['xgb_preds1'] = ((x_test_all['issignalXGB']>test_best)*1) -x_train_all['issignal'] = y_train.values +x_test_all['issignal'] = y_test.values -dfs_orig = x_train_all[x_train_all['issignal']==1] -dfb_orig = x_train_all[x_train_all['issignal']==0] +dfs_orig = x_test_all[x_test_all['issignal']==1] +dfb_orig = x_test_all[x_test_all['issignal']==0] -dfs_cut = x_train_all[(x_train_all['xgb_preds1']==1) & (x_train_all['issignal']==1)] -dfb_cut = x_train_all[(x_train_all['xgb_preds1']==1) & (x_train_all['issignal']==0)] +dfs_cut = x_test_all[(x_test_all['xgb_preds1']==1) & (x_test_all['issignal']==1)] +dfb_cut = x_test_all[(x_test_all['xgb_preds1']==1) & (x_test_all['issignal']==0)] 
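# Note: the concat/drop_duplicates idiom below computes the set difference
# dfs_orig \ dfs_cut, i.e. the true-signal candidates that the ML cut rejects.
# It relies on rows being exact duplicates; since dfs_cut is a row subset of
# x_test_all, an index-based version is a more robust alternative (a sketch,
# assuming the dataframe index is unique):
#
#     difference_s = dfs_orig.loc[dfs_orig.index.difference(dfs_cut.index)]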
difference_s = pd.concat([dfs_orig, dfs_cut]).drop_duplicates(keep=False) -print("x_train_all: ", len(x_train_all)) +print("x_test_all: ", len(x_test_all)) print("dfs_orig: ", len(dfs_orig)) print("dfs_cut: ", len(dfs_cut)) From e478420d48c263d32baa46d3145b50d5022163f1 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Wed, 11 Aug 2021 16:48:43 +0200 Subject: [PATCH 18/40] Trained on 5 features and corrected legend --- clusterXGB.py | 3 +-- distributions/var_distr.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/clusterXGB.py b/clusterXGB.py index 34f403e..fc4c23e 100644 --- a/clusterXGB.py +++ b/clusterXGB.py @@ -105,8 +105,7 @@ def data_selection(signal_path, bgr_path, tree, threads): # features to be trained -cuts = [ 'chi2primneg', 'chi2primpos'] - +cuts = [ 'chi2primneg', 'chi2primpos','chi2geo','distance', 'ldl'] def train_test_set(df_scaled, cuts): diff --git a/distributions/var_distr.py b/distributions/var_distr.py index 43c684d..3501eb3 100644 --- a/distributions/var_distr.py +++ b/distributions/var_distr.py @@ -66,7 +66,7 @@ def hist_variables(dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s,feature, p ax[1].hist(dfs_cut[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'blue') ax[1].hist(dfb_cut[feature], label = 'background', bins = 500, alpha = 0.4, color = 'red') - ax[1].legend(shadow=True,title = 'S/B='+ str(round(len(dfs_cut)/len(dfb_cut), 3)) + 'quality cuts+' + ax[1].legend(shadow=True,title = 'S/B='+ str(round(len(dfs_cut)/len(dfb_cut), 3)) + '\nquality cuts+' '\n S samples: '+str(dfs_cut.shape[0]) + '\n B samples: '+ str(dfb_cut.shape[0]) + '\n ML cut' , title_fontsize=15, fontsize =15, bbox_to_anchor=(1.05, 1), @@ -74,7 +74,7 @@ def hist_variables(dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s,feature, p ax[1].set_xlim(dfb_orig[feature].min(), dfb_orig[feature].max()) - + ax[1].xaxis.set_tick_params(labelsize=15) ax[1].yaxis.set_tick_params(labelsize=15) From 3bc2251f5f0094d0f3d095763d4de412ab936435 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Thu, 12 Aug 2021 11:17:56 +0200 Subject: [PATCH 19/40] Works for this set of variables: ['chi2geo', 'chi2primneg', 'chi2primpos', 'chi2topo', 'distance', 'ldl', 'mass','pT'] --- clusterXGB.py | 6 +++--- library/CBM_ML/tree_importer.py | 15 +++++---------- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/clusterXGB.py b/clusterXGB.py index fc4c23e..1b45b7e 100644 --- a/clusterXGB.py +++ b/clusterXGB.py @@ -108,6 +108,7 @@ def data_selection(signal_path, bgr_path, tree, threads): cuts = [ 'chi2primneg', 'chi2primpos','chi2geo','distance', 'ldl'] + def train_test_set(df_scaled, cuts): """ To make machine learning algorithms more efficient on unseen data we divide @@ -287,9 +288,8 @@ def CM_plot(best, x, output_path): new_log_x = [] -cuts1 = ['chi2geo', 'chi2primneg', 'chi2primpos', 'chi2topo', 'cosineneg', - 'cosinepos', 'cosinetopo', 'distance', 'eta', 'l', 'ldl', - 'mass', 'p', 'pT', 'phi', 'px', 'py', 'pz', 'rapidity', 'x', 'y', 'z'] +cuts1 = ['chi2geo', 'chi2primneg', 'chi2primpos', 'chi2topo', 'distance', 'ldl', + 'mass','pT'] for cut in cuts1: if cut in log_x: diff --git a/library/CBM_ML/tree_importer.py b/library/CBM_ML/tree_importer.py index 4635bf5..7a57414 100644 --- a/library/CBM_ML/tree_importer.py +++ b/library/CBM_ML/tree_importer.py @@ -23,10 +23,8 @@ def tree_importer(path,treename, n): def new_labels(df): - new_labels= ['chi2geo', 'chi2primneg', 'chi2primpos', 'chi2topo', 'cosineneg', - 'cosinepos', 'cosinetopo', 'distance', 'eta', 'l', 'ldl', - 'mass', 
'p', 'pT', 'phi', 'px', 'py', 'pz', 'rapidity', 'x', 'y', 'z', - 'daughter1id', 'daughter2id', 'isfrompv', 'pid', 'issignal'] + new_labels= ['chi2geo', 'chi2primneg', 'chi2primpos', 'chi2topo', 'distance', 'ldl', + 'mass','pT', 'issignal'] df.columns = new_labels return df @@ -52,13 +50,10 @@ def quality_cuts(df): (df['chi2topo'] > 0) mass_cut = (df['mass'] > 1.077) - coord_cut = (abs(df['x']) < 50) & (abs(df['y']) < 50) - dist_l_cut = (df['distance'] > 0) & (df['distance'] < 100) &\ - (df['l'] > 0 ) & (df['ldl'] > 0 ) & (abs(df['l']) < 80) + # coord_cut = (abs(df['x']) < 50) & (abs(df['y']) < 50) + dist_l_cut = (df['distance'] > 0) & (df['distance'] < 100) & (df['ldl'] > 0 ) - pz_cut = (df['pz'] > 0) - cuts = (chi2_cut) & (mass_cut) & (coord_cut) & (dist_l_cut) &\ - (pz_cut) + cuts = (chi2_cut) & (mass_cut) & (dist_l_cut) return df[cuts] From 23c0cee4ef9bfc58893c1c40b9557bf4315bc3ea Mon Sep 17 00:00:00 2001 From: conformist89 Date: Fri, 13 Aug 2021 12:04:30 +0200 Subject: [PATCH 20/40] Plotted distributions for deploy data and decreased label's size in cut_visualization --- clusterXGB.py | 75 ++++++++++++++++++++++++++---------- library/CBM_ML/plot_tools.py | 18 ++++----- 2 files changed, 64 insertions(+), 29 deletions(-) diff --git a/clusterXGB.py b/clusterXGB.py index 1b45b7e..064b6bb 100644 --- a/clusterXGB.py +++ b/clusterXGB.py @@ -4,7 +4,7 @@ import matplotlib.pyplot as plt from library.CBM_ML.tree_importer import tree_importer, new_labels, quality_cuts -from library.CBM_ML.plot_tools import AMS, preds_prob,plot_confusion_matrix +from library.CBM_ML.plot_tools import AMS, preds_prob,plot_confusion_matrix, cut_visualization from sklearn.model_selection import train_test_split @@ -44,6 +44,7 @@ output_path = path_list[2] + if not os.path.exists(output_path): os.mkdir(output_path) @@ -83,14 +84,16 @@ def data_selection(signal_path, bgr_path, tree, threads): signal = new_labels(signal) df_urqmd = new_labels(df_urqmd) - signal = quality_cuts(signal) - df_urqmd = quality_cuts(df_urqmd) + signal_selected = signal + + background_selected = df_urqmd[(df_urqmd['issignal'] == 0) + & ((df_urqmd['mass'] > 1.07) + & (df_urqmd['mass'] < 1.108) | (df_urqmd['mass']>1.1227) + & (df_urqmd['mass'] < 1.3))].sample(n=3*(signal.shape[0])) + + print("Signal: ", len(signal_selected)) + print("background: ", len(background_selected)) - signal_selected = signal[signal['issignal']==1] - background_selected = df_urqmd[(df_urqmd['issignal'] == 0) &\ - ((df_urqmd['mass'] > 1.07) &\ - (df_urqmd['mass'] < 1.108) | (df_urqmd['mass']>1.1227) &\ - (df_urqmd['mass'] < 1.3))] df_scaled = pd.concat([signal_selected, background_selected]) df_scaled = df_scaled.sample(frac=1) @@ -100,15 +103,18 @@ def data_selection(signal_path, bgr_path, tree, threads): return df_scaled + df_scaled = data_selection(signal_path, df_urqmd_path, tree_name, number_of_threads) +print("Df scaled: ", len(df_scaled)) + + # features to be trained cuts = [ 'chi2primneg', 'chi2primpos','chi2geo','distance', 'ldl'] - def train_test_set(df_scaled, cuts): """ To make machine learning algorithms more efficient on unseen data we divide @@ -143,10 +149,30 @@ def train_test_set(df_scaled, cuts): dtrain, dtest1,x_train_all, x_test_all, x_train,x_test, y_train, y_test = train_test_set(df_scaled, cuts) + del df_scaled gc.collect() +# open deploy dataframe with x and y as well +def open_deploy_data(deploy_path, tree, threads): + deploy_data= tree_importer(deploy_path,tree_name, threads) + deploy_data = new_labels(deploy_data) + + return deploy_data + + +def 
deploy_set(deploy_data, cuts): + x_deploy = deploy_data[cuts].copy() + y_deploy =pd.DataFrame(deploy_data['issignal'], dtype='int') + + ddeploy=xgb.DMatrix(x_deploy, label = y_deploy) + return ddeploy, x_deploy, y_deploy + + +deploy_data = open_deploy_data(df_urqmd_path, tree_name, number_of_threads) + +ddeploy, x_deploy, y_deploy = deploy_set(deploy_data, cuts) #Bayesian Optimization function for xgboost @@ -210,6 +236,12 @@ def get_best_params(): bst_test['issignal']=y_test['issignal'] +bst_deploy = pd.DataFrame(data=bst.predict(ddeploy, output_margin=False), columns=["xgb_preds"]) +y_deploy=y_deploy.set_index(np.arange(0,bst_deploy.shape[0])) +bst_deploy['issignal']=y_deploy['issignal'] + + + #The following graph will show us that which features are important for the model ax = xgb.plot_importance(bst) plt.rcParams['figure.figsize'] = [6, 3] @@ -221,10 +253,10 @@ def get_best_params(): #ROC cures for the predictions on train and test sets train_best, test_best = AMS(y_train, bst_train['xgb_preds'],y_test, bst_test['xgb_preds'], output_path) + #The first argument should be a data frame, the second a column in it, in the form 'preds' preds_prob(bst_test,'xgb_preds', 'issignal','test', output_path) - def CM_plot(best, x, output_path): """ Plots confusion matrix. A Confusion Matrix C is such that Cij is equal to @@ -258,25 +290,26 @@ def CM_plot(best, x, output_path): CM_plot(test_best, bst_test, output_path) -print("x_train_all: ", len(x_train_all)) -print("x_test_all: ", len(x_test_all)) -x_test_all['issignalXGB'] = bst_test['xgb_preds'].values -x_test_all['xgb_preds1'] = ((x_test_all['issignalXGB']>test_best)*1) -x_test_all['issignal'] = y_test.values -dfs_orig = x_test_all[x_test_all['issignal']==1] -dfb_orig = x_test_all[x_test_all['issignal']==0] +deploy_data['issignalXGB'] = bst_deploy['xgb_preds'].values +deploy_data['xgb_preds1'] = ((deploy_data['issignalXGB']>test_best)*1) + -dfs_cut = x_test_all[(x_test_all['xgb_preds1']==1) & (x_test_all['issignal']==1)] -dfb_cut = x_test_all[(x_test_all['xgb_preds1']==1) & (x_test_all['issignal']==0)] +dfs_orig = deploy_data[deploy_data['issignal']==1] +dfb_orig = deploy_data[deploy_data['issignal']==0] + + +dfs_cut = deploy_data[(deploy_data['xgb_preds1']==1) & (deploy_data['issignal']==1)] +dfb_cut = deploy_data[(deploy_data['xgb_preds1']==1) & (deploy_data['issignal']==0)] difference_s = pd.concat([dfs_orig, dfs_cut]).drop_duplicates(keep=False) -print("x_test_all: ", len(x_test_all)) +print("x_deploy: ", len(deploy_data)) print("dfs_orig: ", len(dfs_orig)) +print("dfb_orig: ", len(dfb_orig)) print("dfs_cut: ", len(dfs_cut)) print("difference: ", len(difference_s)) @@ -320,3 +353,5 @@ def CM_plot(best, x, output_path): hist_variables(dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s, feat, pdf_cuts) pdf_cuts.close() + +cut_visualization(deploy_data,'issignalXGB',test_best, output_path) diff --git a/library/CBM_ML/plot_tools.py b/library/CBM_ML/plot_tools.py index fce3eee..af34d6f 100644 --- a/library/CBM_ML/plot_tools.py +++ b/library/CBM_ML/plot_tools.py @@ -206,25 +206,25 @@ def cut_visualization(df, variable,cut, output_path, range1=(1.09, 1.19), bins1= fig, ax2 = plt.subplots(figsize=(12, 8), dpi = 300) color = 'tab:blue' ax2.hist(df['mass'],bins = bins1, range=range1, facecolor='blue' ,alpha = 0.35, label='before selection') - ax2.set_ylabel('Counts', fontsize = 15, color=color) + ax2.set_ylabel('Counts', fontsize = 8, color=color) ax2.tick_params(axis='y', labelcolor=color) - ax2.legend( fontsize = 15, loc='upper left') - 
ax2.tick_params(axis='both', which='major', labelsize=15) + ax2.legend( fontsize = 8, loc='upper left') + ax2.tick_params(axis='both', which='major', labelsize=8) ax2.grid() - ax2.set_xlabel("Mass (GeV/${c^2}$)", fontsize = 18) + ax2.set_xlabel("Mass (GeV/${c^2}$)", fontsize = 8) color = 'tab:red' ax1 = ax2.twinx() ax1.hist(df3['mass'], bins = bins1, range=range1, facecolor='red',alpha = 0.35, label="XGB (with a cut > %.2f"%cut+')') - ax1.set_xlabel('Mass in GeV', fontsize = 15) - ax1.set_ylabel('Counts ', fontsize = 15, color=color) + ax1.set_xlabel('Mass in GeV', fontsize = 8) + ax1.set_ylabel('Counts ', fontsize = 8, color=color) ax1.tick_params(axis='y', labelcolor=color) - ax1.tick_params(axis='both', which='major', labelsize=15) - ax1.legend( fontsize = 18,loc='upper right' ) + ax1.tick_params(axis='both', which='major', labelsize=8) + ax1.legend( fontsize = 8,loc='upper right' ) - plt.title("The original sample's Invariant Mass along with mass after selection of XGB", fontsize = 15) + plt.title("The original sample's Invariant Mass along with mass after selection of XGB", fontsize = 10) #plt.text(0.02, 0.1, r'cut > %.4f'%cut, fontsize=15) plt.show() fig.tight_layout() From 92267204b937aad690807d13167cf07132b635ce Mon Sep 17 00:00:00 2001 From: conformist89 Date: Fri, 13 Aug 2021 15:26:58 +0200 Subject: [PATCH 21/40] Updated legend --- distributions/var_distr.py | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/distributions/var_distr.py b/distributions/var_distr.py index 3501eb3..209ab60 100644 --- a/distributions/var_distr.py +++ b/distributions/var_distr.py @@ -41,13 +41,11 @@ def hist_variables(dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s,feature, p ax[0].hist(dfs_orig[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'blue') ax[0].hist(dfb_orig[feature], label = 'background', bins = 500, alpha = 0.4, color = 'red') - ax[0].legend(shadow=True,title = 'S/B='+ str(round(len(dfs_orig)/len(dfb_orig), 3)) + '\n inf, nan was deleted \n $\chi^2$>0 '+ - '\n mass > 1.077 Gev/c , pz >0'+ - '\n z > 0, z<80, l > 0, l < 80, ldl > 0, |x|,|y|<50'+ - '\n cosinepos, cosineneg > 0' + - '\n distance > 0, distance <100' - '\n S samples: '+str(dfs_orig.shape[0]) + '\n B samples: '+ str(dfb_orig.shape[0]) - , title_fontsize=15, fontsize =15, bbox_to_anchor=(1.05, 1), + ax[0].legend(shadow=True,title = 'S/B='+ str(round(len(dfs_orig)/len(dfb_orig), 3)) + + + '\n S samples: '+str(dfs_orig.shape[0]) + '\n B samples: '+ str(dfb_orig.shape[0]) + + '\nquality cuts ', + title_fontsize=15, fontsize =15, bbox_to_anchor=(1.05, 1), loc='upper left', prop=fontP,) ax[0].set_xlim(dfb_orig[feature].min(), dfb_orig[feature].max()) @@ -58,18 +56,18 @@ def hist_variables(dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s,feature, p ax[0].set_title(str(feature) + ' MC ', fontsize = 25) ax[0].set_xlabel(feature, fontsize = 25) - - ax[0].set_yscale('log') + if feature!='mass': + ax[0].set_yscale('log') fig.tight_layout() ax[1].hist(dfs_cut[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'blue') ax[1].hist(dfb_cut[feature], label = 'background', bins = 500, alpha = 0.4, color = 'red') - ax[1].legend(shadow=True,title = 'S/B='+ str(round(len(dfs_cut)/len(dfb_cut), 3)) + '\nquality cuts+' + ax[1].legend(shadow=True,title = 'S/B='+ str(round(len(dfs_cut)/len(dfb_cut), 3)) + '\n S samples: '+str(dfs_cut.shape[0]) + '\n B samples: '+ str(dfb_cut.shape[0]) + - '\n ML cut' - , title_fontsize=15, fontsize =15, bbox_to_anchor=(1.05, 1), + '\nquality cuts 
+ ML cut', + title_fontsize=15, fontsize =15, bbox_to_anchor=(1.05, 1), loc='upper left', prop=fontP,) @@ -81,8 +79,8 @@ def hist_variables(dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s,feature, p ax[1].set_title(feature + ' MC ', fontsize = 25) ax[1].set_xlabel(feature, fontsize = 25) - - ax[1].set_yscale('log') + if feature!='mass': + ax[1].set_yscale('log') fig.tight_layout() @@ -103,8 +101,8 @@ def hist_variables(dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s,feature, p ax[2].set_title(feature + ' MC ', fontsize = 25) ax[2].set_xlabel(feature, fontsize = 25) - - ax[2].set_yscale('log') + if feature!='mass': + ax[2].set_yscale('log') fig.tight_layout() From a82eddfda7b5b2820b70d15404f851744d81010e Mon Sep 17 00:00:00 2001 From: conformist89 Date: Sat, 14 Aug 2021 21:18:39 +0200 Subject: [PATCH 22/40] Changes pic's title --- clusterXGB.py | 2 +- library/CBM_ML/plot_tools.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/clusterXGB.py b/clusterXGB.py index 064b6bb..879a5d6 100644 --- a/clusterXGB.py +++ b/clusterXGB.py @@ -348,7 +348,7 @@ def CM_plot(best, x, output_path): new_log_x.append(cut) -pdf_cuts = PdfPages(output_path+'/'+'dist_cuts.pdf') +pdf_cuts = PdfPages(output_path+'/'+'dist_cuts_urqmd.pdf') for feat in new_log_x: hist_variables(dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s, feat, pdf_cuts) diff --git a/library/CBM_ML/plot_tools.py b/library/CBM_ML/plot_tools.py index af34d6f..db02148 100644 --- a/library/CBM_ML/plot_tools.py +++ b/library/CBM_ML/plot_tools.py @@ -88,7 +88,7 @@ def preds_prob(df, preds, true, dataset, output_path): ax.tick_params(axis='both', which='minor', labelsize=16) plt.show() fig.tight_layout() - fig.savefig(str(output_path)+'/test_best.png') + fig.savefig(str(output_path)+'/test_best_pred.png') """ @@ -228,4 +228,4 @@ def cut_visualization(df, variable,cut, output_path, range1=(1.09, 1.19), bins1= #plt.text(0.02, 0.1, r'cut > %.4f'%cut, fontsize=15) plt.show() fig.tight_layout() - fig.savefig(str(output_path)+"/test_best.png") + fig.savefig(str(output_path)+"/test_best_cut_inv_mass.png") From 06b751eadcea34bc028e99f9d7b77eaf3f4e9598 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Mon, 16 Aug 2021 10:22:35 +0200 Subject: [PATCH 23/40] Implemented function to plot variables before and after ML cut --- clusterXGB.py | 57 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 36 insertions(+), 21 deletions(-) diff --git a/clusterXGB.py b/clusterXGB.py index 879a5d6..c4af526 100644 --- a/clusterXGB.py +++ b/clusterXGB.py @@ -297,7 +297,6 @@ def CM_plot(best, x, output_path): deploy_data['xgb_preds1'] = ((deploy_data['issignalXGB']>test_best)*1) - dfs_orig = deploy_data[deploy_data['issignal']==1] dfb_orig = deploy_data[deploy_data['issignal']==0] @@ -324,34 +323,50 @@ def CM_plot(best, x, output_path): cuts1 = ['chi2geo', 'chi2primneg', 'chi2primpos', 'chi2topo', 'distance', 'ldl', 'mass','pT'] -for cut in cuts1: - if cut in log_x: - dfs_orig[cut+'_log'] = np.log(dfs_orig[cut]) - dfb_orig[cut+'_log'] = np.log(dfb_orig[cut]) - dfs_cut[cut+'_log'] = np.log(dfs_cut[cut]) - dfb_cut[cut+'_log'] = np.log(dfb_cut[cut]) +pdf_cuts_deploy = PdfPages(output_path+'/'+'dist_cuts_urqmd_deploy.pdf') - difference_s[cut+'_log'] = np.log(difference_s[cut]) - new_log_x.append(cut+'_log') +cut_visualization(deploy_data,'issignalXGB',test_best, output_path) - dfs_orig = dfs_orig.drop([cut], axis=1) - dfb_orig = dfb_orig.drop([cut], axis=1) +def variables_distribution(dfs_orig, dfb_orig, dfs_cut, dfb_cut, cuts_plot, 
pdf_cuts): + difference_s = pd.concat([dfs_orig, dfs_cut]).drop_duplicates(keep=False) + non_log_x = ['cosineneg', 'cosinepos', 'cosinetopo', 'mass', 'pT', 'rapidity', + 'phi', 'eta', 'x', 'y','z', 'px', 'py', 'pz', 'l', 'ldl'] - dfs_cut = dfs_cut.drop([cut], axis=1) - dfb_cut = dfb_cut.drop([cut], axis=1) - difference_s = difference_s.drop([cut], axis=1) + log_x = ['chi2geo', 'chi2primneg', 'chi2primpos', 'chi2topo', 'distance'] - if cut in non_log_x: - new_log_x.append(cut) + new_log_x = [] + for cut in cuts_plot: + if cut in log_x: + dfs_orig[cut+'_log'] = np.log(dfs_orig[cut]) + dfb_orig[cut+'_log'] = np.log(dfb_orig[cut]) -pdf_cuts = PdfPages(output_path+'/'+'dist_cuts_urqmd.pdf') -for feat in new_log_x: - hist_variables(dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s, feat, pdf_cuts) + dfs_cut[cut+'_log'] = np.log(dfs_cut[cut]) + dfb_cut[cut+'_log'] = np.log(dfb_cut[cut]) -pdf_cuts.close() + difference_s[cut+'_log'] = np.log(difference_s[cut]) -cut_visualization(deploy_data,'issignalXGB',test_best, output_path) + new_log_x.append(cut+'_log') + + + dfs_orig = dfs_orig.drop([cut], axis=1) + dfb_orig = dfb_orig.drop([cut], axis=1) + + dfs_cut = dfs_cut.drop([cut], axis=1) + dfb_cut = dfb_cut.drop([cut], axis=1) + difference_s = difference_s.drop([cut], axis=1) + + if cut in non_log_x: + new_log_x.append(cut) + + + # pdf_cuts = PdfPages(output_path+'/'+'dist_cuts_urqmd_deploy.pdf') + for feat in new_log_x: + hist_variables(dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s, feat, pdf_cuts) + + pdf_cuts.close() + +variables_distribution(dfs_orig, dfb_orig, dfs_cut, dfb_cut, cuts1, pdf_cuts_deploy) From b9f2846c11ed38cc979806f0f66f59ea87370705 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Mon, 16 Aug 2021 11:18:55 +0200 Subject: [PATCH 24/40] Plotted distributions for train, test and deploy datasets --- clusterXGB.py | 49 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/clusterXGB.py b/clusterXGB.py index c4af526..3c246d1 100644 --- a/clusterXGB.py +++ b/clusterXGB.py @@ -292,26 +292,44 @@ def CM_plot(best, x, output_path): +dfs_orig = deploy_data[deploy_data['issignal']==1] +dfb_orig = deploy_data[deploy_data['issignal']==0] + +x_train_all['issignalXGB'] = bst_train['xgb_preds'].values +x_train_all['xgb_preds1'] = ((x_train_all['issignalXGB']>train_best)*1) + +x_test_all['issignalXGB'] = bst_test['xgb_preds'].values +x_test_all['xgb_preds1'] = ((x_test_all['issignalXGB']>test_best)*1) + deploy_data['issignalXGB'] = bst_deploy['xgb_preds'].values deploy_data['xgb_preds1'] = ((deploy_data['issignalXGB']>test_best)*1) -dfs_orig = deploy_data[deploy_data['issignal']==1] -dfb_orig = deploy_data[deploy_data['issignal']==0] +dfs_cut_train = x_train_all[(x_train_all['xgb_preds1']==1) & (x_train_all['issignal']==1)] +dfb_cut_train = x_train_all[(x_train_all['xgb_preds1']==1) & (x_train_all['issignal']==0)] + +difference_s_train = pd.concat([dfs_orig, dfs_cut_train]).drop_duplicates(keep=False) -dfs_cut = deploy_data[(deploy_data['xgb_preds1']==1) & (deploy_data['issignal']==1)] -dfb_cut = deploy_data[(deploy_data['xgb_preds1']==1) & (deploy_data['issignal']==0)] -difference_s = pd.concat([dfs_orig, dfs_cut]).drop_duplicates(keep=False) +dfs_cut_test = x_test_all[(x_test_all['xgb_preds1']==1) & (x_test_all['issignal']==1)] +dfb_cut_test = x_test_all[(x_test_all['xgb_preds1']==1) & (x_test_all['issignal']==0)] -print("x_deploy: ", len(deploy_data)) -print("dfs_orig: ", len(dfs_orig)) -print("dfb_orig: ", len(dfb_orig)) 
+difference_s_test = pd.concat([dfs_orig, dfs_cut_test]).drop_duplicates(keep=False) -print("dfs_cut: ", len(dfs_cut)) -print("difference: ", len(difference_s)) + +dfs_cut_d = deploy_data[(deploy_data['xgb_preds1']==1) & (deploy_data['issignal']==1)] +dfb_cut_d = deploy_data[(deploy_data['xgb_preds1']==1) & (deploy_data['issignal']==0)] + +difference_s_d = pd.concat([dfs_orig, dfs_cut_d]).drop_duplicates(keep=False) + +# print("x_deploy: ", len(deploy_data)) +# print("dfs_orig: ", len(dfs_orig)) +# print("dfb_orig: ", len(dfb_orig)) +# +# print("dfs_cut: ", len(dfs_cut)) +# print("difference: ", len(difference_s)) non_log_x = ['cosineneg', 'cosinepos', 'cosinetopo', 'mass', 'pT', 'rapidity', 'phi', 'eta', 'x', 'y','z', 'px', 'py', 'pz', 'l', 'ldl'] @@ -324,12 +342,16 @@ def CM_plot(best, x, output_path): 'mass','pT'] + +pdf_cuts_train = PdfPages(output_path+'/'+'dist_cuts_urqmd_train.pdf') +pdf_cuts_test = PdfPages(output_path+'/'+'dist_cuts_urqmd_test.pdf') pdf_cuts_deploy = PdfPages(output_path+'/'+'dist_cuts_urqmd_deploy.pdf') -cut_visualization(deploy_data,'issignalXGB',test_best, output_path) +cut_visualization(deploy_data,'issignalXGB',test_best, output_path) + def variables_distribution(dfs_orig, dfb_orig, dfs_cut, dfb_cut, cuts_plot, pdf_cuts): difference_s = pd.concat([dfs_orig, dfs_cut]).drop_duplicates(keep=False) non_log_x = ['cosineneg', 'cosinepos', 'cosinetopo', 'mass', 'pT', 'rapidity', @@ -363,10 +385,11 @@ def variables_distribution(dfs_orig, dfb_orig, dfs_cut, dfb_cut, cuts_plot, pdf_ new_log_x.append(cut) - # pdf_cuts = PdfPages(output_path+'/'+'dist_cuts_urqmd_deploy.pdf') for feat in new_log_x: hist_variables(dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s, feat, pdf_cuts) pdf_cuts.close() -variables_distribution(dfs_orig, dfb_orig, dfs_cut, dfb_cut, cuts1, pdf_cuts_deploy) +variables_distribution(dfs_orig, dfb_orig, dfs_cut_train, dfb_cut_train, cuts1, pdf_cuts_train) +variables_distribution(dfs_orig, dfb_orig, dfs_cut_test, dfb_cut_test, cuts1, pdf_cuts_test) +variables_distribution(dfs_orig, dfb_orig, dfs_cut_d, dfb_cut_d, cuts1, pdf_cuts_deploy) From 27fea11beb2ff4f7bf5cc990ea44fc6a8fa4e9e5 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Mon, 16 Aug 2021 13:33:17 +0200 Subject: [PATCH 25/40] Implemented function to create dataframe for plotting variables --- clusterXGB.py | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/clusterXGB.py b/clusterXGB.py index 3c246d1..b7ad808 100644 --- a/clusterXGB.py +++ b/clusterXGB.py @@ -295,41 +295,33 @@ def CM_plot(best, x, output_path): dfs_orig = deploy_data[deploy_data['issignal']==1] dfb_orig = deploy_data[deploy_data['issignal']==0] -x_train_all['issignalXGB'] = bst_train['xgb_preds'].values -x_train_all['xgb_preds1'] = ((x_train_all['issignalXGB']>train_best)*1) -x_test_all['issignalXGB'] = bst_test['xgb_preds'].values -x_test_all['xgb_preds1'] = ((x_test_all['issignalXGB']>test_best)*1) +def df_distribution(df, bst_df, df_best): + df['issignalXGB'] = bst_df['xgb_preds'].values + df['xgb_preds1'] = ((df['issignalXGB']>df_best)*1) -deploy_data['issignalXGB'] = bst_deploy['xgb_preds'].values -deploy_data['xgb_preds1'] = ((deploy_data['issignalXGB']>test_best)*1) + dfs_cut = df[(df['xgb_preds1']==1) & (df['issignal']==1)] + dfb_cut = df[(df['xgb_preds1']==1) & (df['issignal']==0)] -dfs_cut_train = x_train_all[(x_train_all['xgb_preds1']==1) & (x_train_all['issignal']==1)] -dfb_cut_train = x_train_all[(x_train_all['xgb_preds1']==1) & (x_train_all['issignal']==0)] + 
difference_s = pd.concat([dfs_orig, dfs_cut]).drop_duplicates(keep=False)
+
+    return dfs_cut, dfb_cut, difference_s
+
+
+dfs_cut_train, dfb_cut_train, difference_s_train = df_distribution(x_train_all,
+                                  bst_train, train_best)
-dfs_cut_train = x_train_all[(x_train_all['xgb_preds1']==1) & (x_train_all['issignal']==1)]
-dfb_cut_train = x_train_all[(x_train_all['xgb_preds1']==1) & (x_train_all['issignal']==0)]
+
-difference_s_train = pd.concat([dfs_orig, dfs_cut_train]).drop_duplicates(keep=False)
+dfs_cut_test, dfb_cut_test, difference_s_test = df_distribution(x_test_all,
+                                  bst_test, test_best)
 
-dfs_cut_test = x_test_all[(x_test_all['xgb_preds1']==1) & (x_test_all['issignal']==1)]
-dfb_cut_test = x_test_all[(x_test_all['xgb_preds1']==1) & (x_test_all['issignal']==0)]
 
-difference_s_test = pd.concat([dfs_orig, dfs_cut_test]).drop_duplicates(keep=False)
+dfs_cut_d, dfb_cut_d, difference_s_d = df_distribution(deploy_data,
+                                  bst_deploy, test_best)
 
-dfs_cut_d = deploy_data[(deploy_data['xgb_preds1']==1) & (deploy_data['issignal']==1)]
-dfb_cut_d = deploy_data[(deploy_data['xgb_preds1']==1) & (deploy_data['issignal']==0)]
 
-difference_s_d = pd.concat([dfs_orig, dfs_cut_d]).drop_duplicates(keep=False)
 
-# print("x_deploy: ", len(deploy_data))
-# print("dfs_orig: ", len(dfs_orig))
-# print("dfb_orig: ", len(dfb_orig))
-#
-# print("dfs_cut: ", len(dfs_cut))
-# print("difference: ", len(difference_s))
 
 non_log_x = ['cosineneg', 'cosinepos', 'cosinetopo', 'mass', 'pT', 'rapidity',
        'phi', 'eta', 'x', 'y','z', 'px', 'py', 'pz', 'l', 'ldl']
@@ -350,7 +342,7 @@ def df_distribution(df, bst_df, df_best):
 
 
 
-cut_visualization(deploy_data,'issignalXGB',test_best, output_path)
+# cut_visualization(deploy_data,'issignalXGB',test_best, output_path)
 
 def variables_distribution(dfs_orig, dfb_orig, dfs_cut, dfb_cut, cuts_plot, pdf_cuts):
     difference_s = pd.concat([dfs_orig, dfs_cut]).drop_duplicates(keep=False)

From d33d0215a3e741eb818bed9c876081663974a40d Mon Sep 17 00:00:00 2001
From: conformist89
Date: Mon, 16 Aug 2021 16:55:56 +0200
Subject: [PATCH 26/40] Fixed signal difference (before and after ML cut)
 computation bug

---
 clusterXGB.py              | 43 +++++++++++++++++++++++++------------
 distributions/var_distr.py |  2 +-
 2 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/clusterXGB.py b/clusterXGB.py
index b7ad808..6ccabd6 100644
--- a/clusterXGB.py
+++ b/clusterXGB.py
@@ -26,6 +26,7 @@ import os
 
 from distributions.var_distr import hist_variables
+from distributions.var1Dcorr import vars, calculate_correlation, plot1Dcorrelation
 
 from matplotlib.backends.backend_pdf import PdfPages
 
@@ -290,10 +291,27 @@ def CM_plot(best, x, output_path):
 
 CM_plot(test_best, bst_test, output_path)
 
+def original_SB(df):
+    dfs_orig = df[df['issignal']==1]
+    dfb_orig = df[df['issignal']==0]
+    return dfs_orig, dfb_orig
 
+dfs_orig_train, dfb_orig_train = original_SB(x_train_all)
+dfs_orig_test, dfb_orig_test = original_SB(x_test_all)
+dfs_orig_d, dfb_orig_d = original_SB(deploy_data)
 
-dfs_orig = deploy_data[deploy_data['issignal']==1]
-dfb_orig = deploy_data[deploy_data['issignal']==0]
+# dfs_orig = deploy_data[deploy_data['issignal']==1]
+# dfb_orig = deploy_data[deploy_data['issignal']==0]
 
+# vars_to_draw_corr = vars(dfs_orig, 'mass')
+#
+# corr_signal, corr_signal_errors = calculate_correlation(dfs_orig, vars_to_draw_corr, 'mass')
+# corr_bg, corr_bg_errors = 
difference_s = pd.concat([dfs_orig, dfs_cut]).drop_duplicates(keep=False) - return dfs_cut, dfb_cut, difference_s + return dfs_cut, dfb_cut -dfs_cut_train, dfb_cut_train, difference_s_train = df_distribution(x_train_all, +dfs_cut_train, dfb_cut_train = df_distribution(x_train_all, bst_train, train_best) -dfs_cut_test, dfb_cut_test, difference_s_test = df_distribution(x_test_all, +dfs_cut_test, dfb_cut_test = df_distribution(x_test_all, bst_test, test_best) -dfs_cut_d, dfb_cut_d, difference_s_d = df_distribution(deploy_data, +dfs_cut_d, dfb_cut_d = df_distribution(deploy_data, bst_deploy, test_best) @@ -340,12 +357,10 @@ def df_distribution(df, bst_df, df_best): pdf_cuts_deploy = PdfPages(output_path+'/'+'dist_cuts_urqmd_deploy.pdf') - - -# cut_visualization(deploy_data,'issignalXGB',test_best, output_path) +cut_visualization(deploy_data,'issignalXGB',test_best, output_path) def variables_distribution(dfs_orig, dfb_orig, dfs_cut, dfb_cut, cuts_plot, pdf_cuts): - difference_s = pd.concat([dfs_orig, dfs_cut]).drop_duplicates(keep=False) + difference_s = pd.concat([dfs_orig[cuts1], dfs_cut[cuts1]]).drop_duplicates(keep=False) non_log_x = ['cosineneg', 'cosinepos', 'cosinetopo', 'mass', 'pT', 'rapidity', 'phi', 'eta', 'x', 'y','z', 'px', 'py', 'pz', 'l', 'ldl'] @@ -382,6 +397,6 @@ def variables_distribution(dfs_orig, dfb_orig, dfs_cut, dfb_cut, cuts_plot, pdf_ pdf_cuts.close() -variables_distribution(dfs_orig, dfb_orig, dfs_cut_train, dfb_cut_train, cuts1, pdf_cuts_train) -variables_distribution(dfs_orig, dfb_orig, dfs_cut_test, dfb_cut_test, cuts1, pdf_cuts_test) -variables_distribution(dfs_orig, dfb_orig, dfs_cut_d, dfb_cut_d, cuts1, pdf_cuts_deploy) +variables_distribution(dfs_orig_train, dfb_orig_train, dfs_cut_train, dfb_cut_train, cuts1, pdf_cuts_train) +variables_distribution(dfs_orig_test, dfb_orig_test, dfs_cut_test, dfb_cut_test, cuts1, pdf_cuts_test) +variables_distribution(dfs_orig_d, dfb_orig_d, dfs_cut_d, dfb_cut_d, cuts1, pdf_cuts_deploy) diff --git a/distributions/var_distr.py b/distributions/var_distr.py index 209ab60..7e4285c 100644 --- a/distributions/var_distr.py +++ b/distributions/var_distr.py @@ -88,7 +88,7 @@ def hist_variables(dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s,feature, p ax[2].hist(difference_s[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'blue') - ax[2].legend(shadow=True,title = 'signal difference', + ax[2].legend(shadow=True,title ='S samples: '+str(len(difference_s)) +'\nsignal difference', title_fontsize=15, fontsize =15, bbox_to_anchor=(1.05, 1), loc='upper left', prop=fontP,) From 15d57e8e9b8ee90ddb72ab32c460d1b4aace9881 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Mon, 16 Aug 2021 17:30:39 +0200 Subject: [PATCH 27/40] Added 1D correlation with some variable --- clusterXGB.py | 20 +++++++++--------- distributions/var1Dcorr.py | 43 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 10 deletions(-) create mode 100644 distributions/var1Dcorr.py diff --git a/clusterXGB.py b/clusterXGB.py index 6ccabd6..d5f004f 100644 --- a/clusterXGB.py +++ b/clusterXGB.py @@ -302,16 +302,16 @@ def original_SB(df): dfs_orig_test, dfb_orig_test = original_SB(x_test_all) dfs_orig_d, dfb_orig_d = original_SB(deploy_data) -# dfs_orig = deploy_data[deploy_data['issignal']==1] -# dfb_orig = deploy_data[deploy_data['issignal']==0] - -# vars_to_draw_corr = vars(dfs_orig, 'mass') -# -# corr_signal, corr_signal_errors = calculate_correlation(dfs_orig, vars_to_draw_corr, 'mass') -# corr_bg, corr_bg_errors = 
calculate_correlation(dfb_orig, vars_to_draw_corr, 'mass')
-#
-# plot1Dcorrelation(vars_to_draw_corr,'mass', corr_signal, corr_signal_errors,
-#                  corr_bg, corr_bg_errors, output_path)
+dfs_orig = deploy_data[deploy_data['issignal']==1]
+dfb_orig = deploy_data[deploy_data['issignal']==0]
+
+vars_to_draw_corr = vars(dfs_orig, 'mass')
+
+corr_signal, corr_signal_errors = calculate_correlation(dfs_orig, vars_to_draw_corr, 'mass')
+corr_bg, corr_bg_errors = calculate_correlation(dfb_orig, vars_to_draw_corr, 'mass')
+
+plot1Dcorrelation(vars_to_draw_corr,'mass', corr_signal, corr_signal_errors,
+                  corr_bg, corr_bg_errors, output_path)
 
diff --git a/distributions/var1Dcorr.py b/distributions/var1Dcorr.py
new file mode 100644
index 0000000..1724b1a
--- /dev/null
+++ b/distributions/var1Dcorr.py
@@ -0,0 +1,43 @@
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+
+def vars(df, var_to_corr):
+    vars_to_draw = list(df)
+    vars_to_draw.remove(var_to_corr)
+    return vars_to_draw
+
+
+def calculate_correlation(df, vars_to_corr, target_var) :
+
+    from scipy.stats import sem
+
+    mean = df[target_var].mean()
+    sigma = df[target_var].std()
+
+    correlation = []
+    error = []
+
+    for j in vars_to_corr :
+        mean_j = df[j].mean()
+        sigma_j = df[j].std()
+
+        # per-sample normalized covariance terms; their mean is the Pearson coefficient
+        cov = (df[j] - mean_j) * (df[target_var] - mean) / (sigma*sigma_j)
+        correlation.append(cov.mean())
+        error.append(sem(cov))  # standard error of the mean of the terms
+
+    return correlation, error
+
+
+def plot1Dcorrelation(vars_to_draw,var_to_corr, corr_signal, corr_signal_errors, corr_bg, corr_bg_errors, output_path):
+    fig, ax = plt.subplots(figsize=(20,10))
+    plt.errorbar(vars_to_draw, corr_signal, yerr=corr_signal_errors, fmt='')
+    plt.errorbar(vars_to_draw, corr_bg, yerr=corr_bg_errors, fmt='')
+    ax.grid(zorder=0)
+    ax.set_xticklabels(vars_to_draw, fontsize=25, rotation =70)
+    ax.set_yticklabels([-0.4, -0.2, 0, 0.2, 0.4], fontsize=25)
+    plt.legend(('signal','background'), fontsize = 25)
+    plt.title('Correlation of all variables with '+ var_to_corr+' along with SEM', fontsize = 25)
+    plt.ylabel('Correlation coefficient', fontsize = 25)
+    fig.tight_layout()
+    fig.savefig(output_path+'/all_vars_corr-'+ var_to_corr+'.png')

From bdea65130a486a1a8eef90953c87ec2c8a9b438a Mon Sep 17 00:00:00 2001
From: conformist89
Date: Tue, 17 Aug 2021 17:55:05 +0200
Subject: [PATCH 28/40] Implemented pt-rapidity 2d distributions, need to fix
 difference bug

---
 distributions/pt_rapidity.py | 99 ++++++++++++++++++++++++++++++++++++
 1 file changed, 99 insertions(+)
 create mode 100644 distributions/pt_rapidity.py

diff --git a/distributions/pt_rapidity.py b/distributions/pt_rapidity.py
new file mode 100644
index 0000000..c1a0a70
--- /dev/null
+++ b/distributions/pt_rapidity.py
@@ -0,0 +1,99 @@
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+
+
+from matplotlib.font_manager import FontProperties
+
+from matplotlib.ticker import (MultipleLocator, FormatStrFormatter,
+                               AutoMinorLocator)
+
+import gc
+import matplotlib as mpl
+
+
+
+mpl.rc('figure', max_open_warning = 0)
+
+
+
+def pT_vs_rapidity(df_orig, df_cut, difference, sign, x_range, y_range, output_path, data_name):
+    fig, axs = plt.subplots(ncols=3, figsize=(15, 4))
+
+
+    if sign ==0:
+        s_label = 'Background'
+        m = 5
+
+    if sign==1:
+        s_label = 'Signal'
+        m = 1
+
+    rej = round((1 - (df_cut.shape[0] / df_orig.shape[0])) * 100, 5)
+    diff = df_orig.shape[0] - df_cut.shape[0]
+    axs[1].legend(shadow=True,title ='ML cut rejects \n'+ str(rej) +'% of '+ s_label,
+                  fontsize =15)
+
+    counts0, xedges0, yedges0, im0 = 
axs[0].hist2d(df_orig['rapidity'], df_orig['pT'] , range = [x_range, y_range], bins=100, + norm=mpl.colors.LogNorm(), cmap=plt.cm.rainbow) + + axs[0].set_title(s_label + ' candidates before ML cut '+data_name, fontsize = 16) + axs[0].set_xlabel('rapidity', fontsize=15) + axs[0].set_ylabel('pT, GeV', fontsize=15) + + + mpl.pyplot.colorbar(im0, ax = axs[0]) + + + + axs[0].xaxis.set_major_locator(MultipleLocator(1)) + axs[0].xaxis.set_major_formatter(FormatStrFormatter('%d')) + + axs[0].xaxis.set_tick_params(which='both', width=2) + + + fig.tight_layout() + + + counts1, xedges1, yedges1, im1 = axs[1].hist2d(df_cut['rapidity'], df_cut['pT'] , range = [x_range, y_range], bins=100, + norm=mpl.colors.LogNorm(), cmap=plt.cm.rainbow) + + axs[1].set_title(s_label + ' candidates after ML cut '+data_name, fontsize = 16) + axs[1].set_xlabel('rapidity', fontsize=15) + axs[1].set_ylabel('pT, GeV', fontsize=15) + + mpl.pyplot.colorbar(im1, ax = axs[1]) + + + + + + axs[1].xaxis.set_major_locator(MultipleLocator(1)) + axs[1].xaxis.set_major_formatter(FormatStrFormatter('%d')) + + axs[1].xaxis.set_tick_params(which='both', width=2) + + fig.tight_layout() + + + counts2, xedges2, yedges2, im2 = axs[2].hist2d(difference['rapidity'], difference['pT'] , range = [x_range, y_range], bins=100, + norm=mpl.colors.LogNorm(), cmap=plt.cm.rainbow) + + axs[2].set_title(s_label + ' candidates after ML cut '+data_name, fontsize = 16) + axs[2].set_xlabel('rapidity', fontsize=15) + axs[2].set_ylabel('pT, GeV', fontsize=15) + + mpl.pyplot.colorbar(im1, ax = axs[1]) + + + + + + axs[2].xaxis.set_major_locator(MultipleLocator(1)) + axs[2].xaxis.set_major_formatter(FormatStrFormatter('%d')) + + axs[2].xaxis.set_tick_params(which='both', width=2) + + fig.tight_layout() + + fig.savefig(output_path+'/pT_rapidity_'+s_label+'_ML_cut_'+data_name+'.png') From 4317471cd10e6376af3b3cc8b32309beddce1c53 Mon Sep 17 00:00:00 2001 From: conformist89 Date: Wed, 18 Aug 2021 11:42:11 +0200 Subject: [PATCH 29/40] Fixed label bug --- clusterXGB.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/clusterXGB.py b/clusterXGB.py index d5f004f..348f778 100644 --- a/clusterXGB.py +++ b/clusterXGB.py @@ -3,7 +3,7 @@ import xgboost as xgb import matplotlib.pyplot as plt -from library.CBM_ML.tree_importer import tree_importer, new_labels, quality_cuts +from library.CBM_ML.tree_importer import tree_importer from library.CBM_ML.plot_tools import AMS, preds_prob,plot_confusion_matrix, cut_visualization from sklearn.model_selection import train_test_split @@ -27,6 +27,7 @@ from distributions.var_distr import hist_variables from distributions.var1Dcorr import vars, calculate_correlation, plot1Dcorrelation +from distributions.pt_rapidity import pT_vs_rapidity from matplotlib.backends.backend_pdf import PdfPages @@ -82,8 +83,6 @@ def data_selection(signal_path, bgr_path, tree, threads): signal= tree_importer(signal_path,tree_name, threads) df_urqmd = tree_importer(bgr_path, tree_name, threads) - signal = new_labels(signal) - df_urqmd = new_labels(df_urqmd) signal_selected = signal @@ -157,7 +156,6 @@ def train_test_set(df_scaled, cuts): # open deploy dataframe with x and y as well def open_deploy_data(deploy_path, tree, threads): deploy_data= tree_importer(deploy_path,tree_name, threads) - deploy_data = new_labels(deploy_data) return deploy_data @@ -347,7 +345,7 @@ def df_distribution(df, bst_df, df_best): new_log_x = [] -cuts1 = ['chi2geo', 'chi2primneg', 'chi2primpos', 'chi2topo', 'distance', 'ldl', +cuts1 = ['chi2geo', 
'chi2primneg', 'chi2primpos', 'distance', 'ldl', 'rapidity',
+        'mass','pT']
 
@@ -400,3 +398,15 @@ def variables_distribution(dfs_orig, dfb_orig, dfs_cut, dfb_cut, cuts_plot, pdf_
 variables_distribution(dfs_orig_train, dfb_orig_train, dfs_cut_train, dfb_cut_train, cuts1, pdf_cuts_train)
 variables_distribution(dfs_orig_test, dfb_orig_test, dfs_cut_test, dfb_cut_test, cuts1, pdf_cuts_test)
 variables_distribution(dfs_orig_d, dfb_orig_d, dfs_cut_d, dfb_cut_d, cuts1, pdf_cuts_deploy)
+
+
+def difference_df(df_orig, df_cut, cut):
+    return pd.concat([df_orig[cut], df_cut[cut]]).drop_duplicates(keep=False)
+
+
+rapidity_range_unclean = [-1.5, 1.5]
+rapidity_range_clean = [-0.5, 5]
+pT_range = [-0.5, 3]
+
+pT_vs_rapidity(dfs_orig_train, dfs_cut_train, difference_df(dfs_orig_train, dfs_cut_train, cuts1),
+               1, rapidity_range_clean, pT_range, output_path, ' train')

From 92f78af3664dba3324eb5394ba9197fdc5a09235 Mon Sep 17 00:00:00 2001
From: conformist89
Date: Wed, 18 Aug 2021 14:13:12 +0200
Subject: [PATCH 30/40] Fixed difference plot bug

---
 distributions/pt_rapidity.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/distributions/pt_rapidity.py b/distributions/pt_rapidity.py
index c1a0a70..43f6544 100644
--- a/distributions/pt_rapidity.py
+++ b/distributions/pt_rapidity.py
@@ -18,7 +18,7 @@
 
 
 def pT_vs_rapidity(df_orig, df_cut, difference, sign, x_range, y_range, output_path, data_name):
-    fig, axs = plt.subplots(ncols=3, figsize=(15, 4))
+    fig, axs = plt.subplots(1,3, figsize=(15, 4), gridspec_kw={'width_ratios': [1, 1, 1]})
 
 
     if sign ==0:
@@ -29,10 +29,17 @@ def pT_vs_rapidity(df_orig, df_cut, difference, sign, x_range, y_range, output_p
         s_label = 'Signal'
         m = 1
 
+    axs[0].set_aspect(aspect = 'auto')
+    axs[1].set_aspect(aspect = 'auto')
+    axs[2].set_aspect(aspect = 'auto')
+
     rej = round((1 - (df_cut.shape[0] / df_orig.shape[0])) * 100, 5)
     diff = df_orig.shape[0] - df_cut.shape[0]
-    axs[1].legend(shadow=True,title ='ML cut rejects \n'+ str(rej) +'% of '+ s_label,
-                  fontsize =15)
+    axs[0].legend(shadow=True, title =str(len(df_orig))+' samples', fontsize =14)
+    axs[1].legend(shadow=True, title =str(len(df_cut))+' samples', fontsize =14)
+    axs[2].legend(shadow=True, title ='ML cut rejects \n'+ str(rej) +'% of '+ s_label +
+                  '\n ' + str(diff)+ ' samples were rejected ',
+                  fontsize =14)
 
     counts0, xedges0, yedges0, im0 = axs[0].hist2d(df_orig['rapidity'], df_orig['pT'] , range = [x_range, y_range], bins=100,
                                                    norm=mpl.colors.LogNorm(), cmap=plt.cm.rainbow)
@@ -79,11 +86,11 @@ def pT_vs_rapidity(df_orig, df_cut, difference, sign, x_range, y_range, output_p
     counts2, xedges2, yedges2, im2 = axs[2].hist2d(difference['rapidity'], difference['pT'] , range = [x_range, y_range], bins=100,
                                                    norm=mpl.colors.LogNorm(), cmap=plt.cm.rainbow)
 
-    axs[2].set_title(s_label + ' candidates after ML cut '+data_name, fontsize = 16)
+    axs[2].set_title(s_label + ' difference ', fontsize = 16)
     axs[2].set_xlabel('rapidity', fontsize=15)
     axs[2].set_ylabel('pT, GeV', fontsize=15)
 
-    mpl.pyplot.colorbar(im1, ax = axs[1])
+    mpl.pyplot.colorbar(im2, ax = axs[2])
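A note on the colorbar fix above: hist2d returns the panel's image handle as the fourth element of its return tuple, and each colorbar has to be built from that panel's own handle rather than a handle from a neighbouring panel. A minimal, self-contained sketch of the pattern (the random toy data merely stands in for rapidity and pT; it is not taken from the patches):

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Toy (rapidity, pT)-like data for illustration only
rng = np.random.default_rng(0)
y_vals = rng.normal(1.5, 0.5, 10_000)
pt_vals = rng.exponential(0.4, 10_000)

fig, axs = plt.subplots(1, 2, figsize=(10, 4))
# hist2d returns (counts, xedges, yedges, image); the image handle is what
# the colorbar needs, one handle per panel.
*_, im0 = axs[0].hist2d(y_vals, pt_vals, bins=100, norm=mpl.colors.LogNorm())
*_, im1 = axs[1].hist2d(y_vals, pt_vals, bins=50, norm=mpl.colors.LogNorm())
fig.colorbar(im0, ax=axs[0])   # scale of the first panel
fig.colorbar(im1, ax=axs[1])   # scale of the second panel, not im0's
fig.tight_layout()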

From 955a5cd1445047e0f74bd9a86ffa4c56155b44dc Mon Sep 17 00:00:00 2001
From: conformist89
Date: Wed, 18 Aug 2021 14:32:33 +0200
Subject: [PATCH 31/40] Plotted pt-rapidity 2D histogram for train and deploy
 sets

---
 clusterXGB.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/clusterXGB.py b/clusterXGB.py
index 348f778..528e57d 100644
--- a/clusterXGB.py
+++ b/clusterXGB.py
@@ -410,3 +410,9 @@ def difference_df(df_orig, df_cut, cut):
 
 pT_vs_rapidity(dfs_orig_train, dfs_cut_train, difference_df(dfs_orig_train, dfs_cut_train, cuts1),
                1, rapidity_range_clean, pT_range, output_path, ' train')
+
+pT_vs_rapidity(dfs_orig_test, dfs_cut_test, difference_df(dfs_orig_test, dfs_cut_test, cuts1),
+               1, rapidity_range_clean, pT_range, output_path, ' test')
+
+pT_vs_rapidity(dfs_orig_d, dfs_cut_d, difference_df(dfs_orig_d, dfs_cut_d, cuts1),
+               1, rapidity_range_clean, pT_range, output_path, ' deploy')

From 6cb514558c3104537f910a4e009f0c0cb34bb8e5 Mon Sep 17 00:00:00 2001
From: conformist89
Date: Fri, 20 Aug 2021 00:40:05 +0200
Subject: [PATCH 32/40] Implemented preliminary version of 2D hist

---
 distributions/twoD_mass_pT.py | 53 +++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100644 distributions/twoD_mass_pT.py

diff --git a/distributions/twoD_mass_pT.py b/distributions/twoD_mass_pT.py
new file mode 100644
index 0000000..0d75175
--- /dev/null
+++ b/distributions/twoD_mass_pT.py
@@ -0,0 +1,53 @@
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sn
+from matplotlib.backends.backend_pdf import PdfPages
+import matplotlib as mpl
+
+def plot2D(df, sgn,x_axis_value, y_axis_value, range_x, range_y, pdf_key):
+    """
+
+    Plots 2-D histogram.
+
+    x_axis_value: e.g. 'mass'
+    y_axis_value: e.g. 'distance'
+    range_x: e.g. [1, 1.177]
+    range_y: e.g [0, 100]
+    pdf_key: PdfPages object that the figure is appended to
+
+    """
+
+    fig, axs = plt.subplots(figsize=(15, 10))
+    cax = plt.hist2d(df[x_axis_value],df[y_axis_value],range=[range_x, range_y], bins=100,
+                     norm=mpl.colors.LogNorm(), cmap=plt.cm.viridis)
+
+
+    if x_axis_value=='mass':
+        unit = r' $, \frac{GeV}{c^2}$ '
+        plt.vlines(x=1.115683,ymin=range_y[0],ymax=range_y[1], color='r', linestyle='-')
+
+    if x_axis_value=='pT':
+        unit = r' $, \frac{GeV}{c}$'
+
+
+    if sgn==1:
+        plt.title('Signal candidates, I love them', fontsize = 25)
+
+    if sgn==0:
+        plt.title('Background candidates', fontsize = 25)
+
+
+    plt.xlabel(x_axis_value+unit, fontsize=25)
+    plt.ylabel(y_axis_value, fontsize=25)
+
+    plt.rcParams['font.size'] = '25'
+    for label in(axs.get_xticklabels() + axs.get_yticklabels()):
+        label.set_fontsize(25)
+
+    mpl.pyplot.colorbar()
+
+    plt.legend(shadow=True,title =str(len(df))+ " samples")
+
+    fig.tight_layout()
+    plt.savefig(pdf_key,format='pdf')
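plot2D above writes every figure into the shared pdf_key object, so one call per feature accumulates into a single multi-page PDF. For reference, a minimal PdfPages sketch in the canonical pdf.savefig() form; the file name and the loop contents here are made up:

import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# Hypothetical output file; one page is appended per savefig() call.
with PdfPages('example_hists.pdf') as pdf:
    for bins in (10, 50, 100):
        fig, ax = plt.subplots()
        ax.hist(range(100), bins=bins)
        ax.set_title(f'{bins} bins')
        pdf.savefig(fig)   # append the current figure as a new page
        plt.close(fig)     # free the figure once it is on disk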

From b2045a900a2a786210e632752e243bc57b271105 Mon Sep 17 00:00:00 2001
From: conformist89
Date: Fri, 20 Aug 2021 00:41:12 +0200
Subject: [PATCH 33/40] Added 2d hist invoke

---
 clusterXGB.py | 69 ++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 57 insertions(+), 12 deletions(-)

diff --git a/clusterXGB.py b/clusterXGB.py
index 528e57d..726725f 100644
--- a/clusterXGB.py
+++ b/clusterXGB.py
@@ -28,6 +28,7 @@
 from distributions.var_distr import hist_variables
 from distributions.var1Dcorr import vars, calculate_correlation, plot1Dcorrelation
 from distributions.pt_rapidity import pT_vs_rapidity
+from distributions.twoD_mass_pT import plot2D
 
 from matplotlib.backends.backend_pdf import PdfPages
 
@@ -313,6 +314,50 @@ def original_SB(df):
 
 
+
+pdf_2d_mass_feat = PdfPages(output_path+'/'+'2d_mass_feat_train.pdf')
+# for var in vars_to_draw_corr:
+#     plot2D(dfs_orig_train, 'mass', var, [1.108, 1.1227], [dfs_orig_train[var].min(),
+#             dfs_orig_train[var].min()], pdf_2d_mass_feat)
+#
+# pdf_2d_mass_feat.close()
+
+
+def dist2D(df, vars,x_range, pdf):
+
+    df_new = df.copy()
+
+    non_log_x = ['cosineneg', 'cosinepos', 'cosinetopo', 'mass', 'pT', 'rapidity',
+           'phi', 'eta', 'x', 'y','z', 'px', 'py', 'pz', 'l', 'ldl']
+
+    log_x = ['chi2geo', 'chi2primneg', 'chi2primpos', 'chi2topo', 'distance']
+
+    new_log_x = []
+
+    for var in vars:
+        if var in log_x:
+            df_new[var+'_log'] = np.log(df_new[var])
+            df_new = df_new.drop([var], axis=1)
+            new_log_x.append(var+'_log')
+
+        if var in non_log_x:
+            new_log_x.append(var)
+
+    print(df_new.columns)
+    print("DF: ", df_new)
+    print("new log_x: ", new_log_x)
+
+    for feat in new_log_x:
+
+        plot2D(df_new, 1, 'mass', feat, x_range,[df_new[feat].min(),
+               df_new[feat].max()], pdf)
+
+
+    pdf.close()
+
+dist2D(dfs_orig_train, ['chi2geo', 'chi2primneg', 'chi2primpos', 'distance', 'ldl', 'rapidity',
+        'pT'], [1.108, 1.1227], pdf_2d_mass_feat)
+
 def df_distribution(df, bst_df, df_best):
     df['issignalXGB'] = bst_df['xgb_preds'].values
     df['xgb_preds1'] = ((df['issignalXGB']>df_best)*1)
@@ -404,15 +449,15 @@ def difference_df(df_orig, df_cut, cut):
     return pd.concat([df_orig[cut], df_cut[cut]]).drop_duplicates(keep=False)
 
 
-rapidity_range_unclean = [-1.5, 1.5]
-rapidity_range_clean = [-0.5, 5]
-pT_range = [-0.5, 3]
-
-pT_vs_rapidity(dfs_orig_train, dfs_cut_train, difference_df(dfs_orig_train, dfs_cut_train, cuts1),
-               1, rapidity_range_clean, pT_range, output_path, ' train')
-
-pT_vs_rapidity(dfs_orig_test, dfs_cut_test, difference_df(dfs_orig_test, dfs_cut_test, cuts1),
-               1, rapidity_range_clean, pT_range, output_path, ' test')
-
-pT_vs_rapidity(dfs_orig_d, dfs_cut_d, difference_df(dfs_orig_d, dfs_cut_d, cuts1),
-               1, rapidity_range_clean, pT_range, output_path, ' deploy')
+# rapidity_range_unclean = [-1.5, 1.5]
+# rapidity_range_clean = [-0.5, 5]
+# pT_range = [-0.5, 3]
+#
+# pT_vs_rapidity(dfs_orig_train, dfs_cut_train, difference_df(dfs_orig_train, dfs_cut_train, cuts1),
+#                1, rapidity_range_clean, pT_range, output_path, ' train')
+#
+# pT_vs_rapidity(dfs_orig_test, dfs_cut_test, difference_df(dfs_orig_test, dfs_cut_test, cuts1),
+#                1, rapidity_range_clean, pT_range, output_path, ' test')
+#
+# pT_vs_rapidity(dfs_orig_d, dfs_cut_d, difference_df(dfs_orig_d, dfs_cut_d, cuts1),
+#                1, rapidity_range_clean, pT_range, output_path, ' deploy')

From 1d04b24264d9ecc3583a2965b03d241585174e42 Mon Sep 17 00:00:00 2001
From: conformist89
Date: Fri, 20 Aug 2021 00:41:57 +0200
Subject: [PATCH 34/40] Deleted comment

---
 distributions/var_distr.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/distributions/var_distr.py b/distributions/var_distr.py
index 7e4285c..f805a71 100644
--- a/distributions/var_distr.py
+++ b/distributions/var_distr.py
@@ -31,7 +31,6 @@ def hist_variables(dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s,feature, p
        name of pdf document with distributions
     """
 
-    # fig, ax = plt.subplots(figsize=(20, 10))
     fig, ax = plt.subplots(3, figsize=(20, 10))
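The dist2D helper introduced in PATCH 33 histograms log(x) for the chi2-like and distance variables because they span several orders of magnitude. A small stand-alone sketch of that log-column pattern, assuming strictly positive inputs as the quality cuts enforce (the helper name add_log_columns is illustrative, not part of the repo):

import numpy as np
import pandas as pd

log_x = ['chi2geo', 'chi2primneg', 'chi2primpos', 'chi2topo', 'distance']

def add_log_columns(df, features, log_features=tuple(log_x)):
    # Return a copy where heavy-tailed features are replaced by their log,
    # plus the list of column names to histogram afterwards.
    out = df.copy()
    plotted = []
    for f in features:
        if f in log_features:
            out[f + '_log'] = np.log(out[f])   # assumes f > 0 after quality cuts
            out = out.drop(columns=[f])
            plotted.append(f + '_log')
        else:
            plotted.append(f)
    return out, plotted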

From e5ca48a9e7a043a7496025b58394f195816ad218 Mon Sep 17 00:00:00 2001
From: conformist89
Date: Fri, 20 Aug 2021 11:04:17 +0200
Subject: [PATCH 35/40] Added name of the sample (for example, train, test,
 etc)

---
 distributions/twoD_mass_pT.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/distributions/twoD_mass_pT.py b/distributions/twoD_mass_pT.py
index 0d75175..93dc79b 100644
--- a/distributions/twoD_mass_pT.py
+++ b/distributions/twoD_mass_pT.py
@@ -5,7 +5,7 @@
 from matplotlib.backends.backend_pdf import PdfPages
 import matplotlib as mpl
 
-def plot2D(df, sgn,x_axis_value, y_axis_value, range_x, range_y, pdf_key):
+def plot2D(df, sample, sgn,x_axis_value, y_axis_value, range_x, range_y, pdf_key):
     """
 
     Plots 2-D histogram.
@@ -32,18 +32,16 @@ def plot2D(df, sgn,x_axis_value, y_axis_value, range_x, range_y, pdf_key):
 
 
     if sgn==1:
-        plt.title('Signal candidates, I love them', fontsize = 25)
+        plt.title('Signal candidates ' + sample, fontsize = 25)
 
     if sgn==0:
-        plt.title('Background candidates', fontsize = 25)
+        plt.title('Background candidates' + sample, fontsize = 25)
 
 
     plt.xlabel(x_axis_value+unit, fontsize=25)
     plt.ylabel(y_axis_value, fontsize=25)
 
-    plt.rcParams['font.size'] = '25'
-    for label in(axs.get_xticklabels() + axs.get_yticklabels()):
-        label.set_fontsize(25)
+
 
     mpl.pyplot.colorbar()

From 3504ee527642d3df6b67ab33f63c93fbf5cbc966 Mon Sep 17 00:00:00 2001
From: conformist89
Date: Fri, 20 Aug 2021 11:05:02 +0200
Subject: [PATCH 36/40] Added name of the sample (for example, train, test,
 etc)

---
 distributions/var_distr.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/distributions/var_distr.py b/distributions/var_distr.py
index f805a71..be0eeb4 100644
--- a/distributions/var_distr.py
+++ b/distributions/var_distr.py
@@ -15,7 +15,7 @@
 mpl.rc('figure', max_open_warning = 0)
 
 
-def hist_variables(dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s,feature, pdf_key):
+def hist_variables(sample, dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s,feature, pdf_key):
     """
     Applied quality cuts and created distributions for all the features in pdf
     file
@@ -75,7 +75,7 @@ def hist_variables(dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s,feature, p
     ax[1].xaxis.set_tick_params(labelsize=15)
     ax[1].yaxis.set_tick_params(labelsize=15)
 
-    ax[1].set_title(feature + ' MC ', fontsize = 25)
+    ax[1].set_title(feature + ' MC '+ sample, fontsize = 25)
     ax[1].set_xlabel(feature, fontsize = 25)
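variables_distribution computes the "signal difference" with the concat/drop_duplicates idiom that PATCH 26 already had to repair. A toy example of what the idiom actually does, and why the frames are first restricted to a shared column list such as df[cuts1]:

import pandas as pd

before = pd.DataFrame({'mass': [1.10, 1.11, 1.12], 'pT': [0.2, 0.5, 0.9]})
after = before.iloc[[0, 2]]   # rows surviving a hypothetical cut

# Rows occurring an even number of times in the concatenation are dropped
# entirely, so only rows present in `before` but not in `after` remain --
# an anti-join built from duplicate removal.
rejected = pd.concat([before, after]).drop_duplicates(keep=False)
print(rejected)   # the row with mass 1.11

# Two caveats: the frames must share exactly the same columns (hence the
# df[cuts1] restriction once prediction columns like issignalXGB are added),
# and exact duplicates already inside `before` are silently dropped too.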

From 3287e0f26ca8a1e760d6f06d0f19e84d31622130 Mon Sep 17 00:00:00 2001
From: conformist89
Date: Fri, 20 Aug 2021 11:06:09 +0200
Subject: [PATCH 37/40] Invoked the distributions in the main file

---
 clusterXGB.py | 50 ++++++++++++++++++++++++--------------------------
 1 file changed, 24 insertions(+), 26 deletions(-)

diff --git a/clusterXGB.py b/clusterXGB.py
index 726725f..acdec37 100644
--- a/clusterXGB.py
+++ b/clusterXGB.py
@@ -316,11 +316,7 @@ def original_SB(df):
 
 pdf_2d_mass_feat = PdfPages(output_path+'/'+'2d_mass_feat_train.pdf')
-# for var in vars_to_draw_corr:
-#     plot2D(dfs_orig_train, 'mass', var, [1.108, 1.1227], [dfs_orig_train[var].min(),
-#             dfs_orig_train[var].min()], pdf_2d_mass_feat)
-#
-# pdf_2d_mass_feat.close()
+
 
 def dist2D(df, vars,x_range, pdf):
 
@@ -343,13 +339,10 @@ def dist2D(df, vars,x_range, pdf):
         if var in non_log_x:
             new_log_x.append(var)
 
-    print(df_new.columns)
-    print("DF: ", df_new)
-    print("new log_x: ", new_log_x)
 
     for feat in new_log_x:
 
-        plot2D(df_new, 1, 'mass', feat, x_range,[df_new[feat].min(),
+        plot2D(df_new, ' train', 1, 'mass', feat, x_range,[df_new[feat].min(),
                df_new[feat].max()], pdf)
 
@@ -395,7 +392,12 @@ def df_distribution(df, bst_df, df_best):
 
 cut_visualization(deploy_data,'issignalXGB',test_best, output_path)
 
-def variables_distribution(dfs_orig, dfb_orig, dfs_cut, dfb_cut, cuts_plot, pdf_cuts):
+def variables_distribution(sample, dfs_orig, dfb_orig, dfs_cut, dfb_cut, cuts_plot, pdf_cuts):
+    # dfs_orig_new = dfs_orig.copy()
+    # dfb_orig_new = dfb_orig.copy()
+    # dfs_cut_new = dfs_cut.copy()
+    # dfb_cut_new = dfb_cut.copy()
+
     difference_s = pd.concat([dfs_orig[cuts1], dfs_cut[cuts1]]).drop_duplicates(keep=False)
     non_log_x = ['cosineneg', 'cosinepos', 'cosinetopo', 'mass', 'pT', 'rapidity',
            'phi', 'eta', 'x', 'y','z', 'px', 'py', 'pz', 'l', 'ldl']
@@ -436,28 +434,28 @@ def variables_distribution(dfs_orig, dfb_orig, dfs_cut, dfb_cut, cuts_plot, pdf_
 
     for feat in new_log_x:
-        hist_variables(dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s, feat, pdf_cuts)
+        hist_variables(sample, dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s, feat, pdf_cuts)
 
     pdf_cuts.close()
 
-variables_distribution(dfs_orig_train, dfb_orig_train, dfs_cut_train, dfb_cut_train, cuts1, pdf_cuts_train)
-variables_distribution(dfs_orig_test, dfb_orig_test, dfs_cut_test, dfb_cut_test, cuts1, pdf_cuts_test)
-variables_distribution(dfs_orig_d, dfb_orig_d, dfs_cut_d, dfb_cut_d, cuts1, pdf_cuts_deploy)
+variables_distribution(' train', dfs_orig_train, dfb_orig_train, dfs_cut_train, dfb_cut_train, cuts1, pdf_cuts_train)
+variables_distribution(' test', dfs_orig_test, dfb_orig_test, dfs_cut_test, dfb_cut_test, cuts1, pdf_cuts_test)
+variables_distribution(' deploy', dfs_orig_d, dfb_orig_d, dfs_cut_d, dfb_cut_d, cuts1, pdf_cuts_deploy)
 
 
 def difference_df(df_orig, df_cut, cut):
     return pd.concat([df_orig[cut], df_cut[cut]]).drop_duplicates(keep=False)
 
 
-# rapidity_range_unclean = [-1.5, 1.5]
-# rapidity_range_clean = [-0.5, 5]
-# pT_range = [-0.5, 3]
-#
-# pT_vs_rapidity(dfs_orig_train, dfs_cut_train, difference_df(dfs_orig_train, dfs_cut_train, cuts1),
-#                1, rapidity_range_clean, pT_range, output_path, ' train')
-#
-# pT_vs_rapidity(dfs_orig_test, dfs_cut_test, difference_df(dfs_orig_test, dfs_cut_test, cuts1),
-#                1, rapidity_range_clean, pT_range, output_path, ' test')
-#
-# pT_vs_rapidity(dfs_orig_d, dfs_cut_d, difference_df(dfs_orig_d, dfs_cut_d, cuts1),
-#                1, rapidity_range_clean, pT_range, output_path, ' deploy')
+rapidity_range_unclean = [-1.5, 1.5]
+rapidity_range_clean = [-0.5, 5]
+pT_range = [-0.5, 3]
+
+pT_vs_rapidity(dfs_orig_train, dfs_cut_train, difference_df(dfs_orig_train, dfs_cut_train, cuts1),
+               1, rapidity_range_clean, pT_range, output_path, ' train')
+
+pT_vs_rapidity(dfs_orig_test, dfs_cut_test, difference_df(dfs_orig_test, dfs_cut_test, cuts1),
+               1, rapidity_range_clean, pT_range, output_path, ' test')
+
+pT_vs_rapidity(dfs_orig_d, dfs_cut_d, difference_df(dfs_orig_d, dfs_cut_d, cuts1),
+               1, rapidity_range_clean, pT_range, output_path, ' deploy')

From 2f9451ce7d854dfacf1a893248c16b275ebc6fa8 Mon Sep 17 00:00:00 2001
From: conformist89
Date: Fri, 20 Aug 2021 12:26:34 +0200
Subject: [PATCH 38/40] Added sample's name

---
 distributions/var_distr.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/distributions/var_distr.py b/distributions/var_distr.py
index be0eeb4..931e995 100644
--- a/distributions/var_distr.py
+++ b/distributions/var_distr.py
@@ -52,7 +52,7 @@ def hist_variables(sample, dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s,fe
     ax[0].xaxis.set_tick_params(labelsize=15)
     ax[0].yaxis.set_tick_params(labelsize=15)
 
-    ax[0].set_title(str(feature) + ' MC ', fontsize = 25)
+    ax[0].set_title(str(feature) + ' MC '+ sample, fontsize = 25)
     ax[0].set_xlabel(feature, fontsize = 25)
 
@@ -97,7 +97,7 @@ def hist_variables(sample, dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s,fe
     ax[2].xaxis.set_tick_params(labelsize=15)
     ax[2].yaxis.set_tick_params(labelsize=15)
 
-    ax[2].set_title(feature + ' MC ', fontsize = 25)
+    ax[2].set_title(feature + ' MC '+ sample, fontsize = 25)
     ax[2].set_xlabel(feature, fontsize = 25)

From a88ca356d4c5266bb0762300b306cf14ee5b8322 Mon Sep 17 00:00:00 2001
From: conformist89
Date: Fri, 20 Aug 2021 15:22:36 +0200
Subject: [PATCH 39/40] Preliminary visualization of the BDT tree
 architecture

---
 clusterXGB.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/clusterXGB.py b/clusterXGB.py
index acdec37..231c46c 100644
--- a/clusterXGB.py
+++ b/clusterXGB.py
@@ -257,6 +257,12 @@ def get_best_params():
 #The first argument should be a data frame, the second a column in it, in the form 'preds'
 preds_prob(bst_test,'xgb_preds', 'issignal','test', output_path)
 
+xgb.plot_tree(bst, num_trees=0)
+fig = plt.gcf()
+fig.set_size_inches(150, 100)
+fig.savefig(output_path+'/tree.png')
+
+
 def CM_plot(best, x, output_path):
     """
     Plots confusion matrix. A Confusion Matrix C is such that Cij is equal to

From e9c14bf267456daa5416825aea87aa811e148508 Mon Sep 17 00:00:00 2001
From: conformist89
Date: Mon, 23 Aug 2021 12:11:05 +0200
Subject: [PATCH 40/40] Invoked 2D pT-rapidity and feature-mass distributions

---
 clusterXGB.py | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/clusterXGB.py b/clusterXGB.py
index 231c46c..41ee6f7 100644
--- a/clusterXGB.py
+++ b/clusterXGB.py
@@ -321,11 +321,12 @@ def original_SB(df):
 
 
-pdf_2d_mass_feat = PdfPages(output_path+'/'+'2d_mass_feat_train.pdf')
+pdf_2d_mass_feat_sign = PdfPages(output_path+'/'+'2d_mass_feat_sign_train.pdf')
+pdf_2d_mass_feat_bgr = PdfPages(output_path+'/'+'2d_mass_feat_bgr_train.pdf')
 
 
-def dist2D(df, vars,x_range, pdf):
+def dist2D(df, vars,x_range, sign, pdf):
 
     df_new = df.copy()
 
@@ -343,14 +349,23 @@ def dist2D(df, vars,x_range, pdf):
 
     for feat in new_log_x:
 
-        plot2D(df_new, ' train', 1, 'mass', feat, x_range,[df_new[feat].min(),
+        plot2D(df_new, ' train', sign, 'mass', feat, x_range,[df_new[feat].min(),
                df_new[feat].max()], pdf)
 
 
     pdf.close()
 
+
+
 dist2D(dfs_orig_train, ['chi2geo', 'chi2primneg', 'chi2primpos', 'distance', 'ldl', 'rapidity',
-        'pT'], [1.108, 1.1227], pdf_2d_mass_feat)
+        'pT'], [1.108, 1.1227], 1, pdf_2d_mass_feat_sign)
+
+
+dist2D(dfb_orig_train, ['chi2geo', 'chi2primneg', 'chi2primpos', 'distance', 'ldl', 'rapidity',
+        'pT'], [1, 1.5], 0, pdf_2d_mass_feat_bgr)
+
+
 
 def df_distribution(df, bst_df, df_best):
@@ -465,3 +475,14 @@ def difference_df(df_orig, df_cut, cut):
 
 pT_vs_rapidity(dfs_orig_d, dfs_cut_d, difference_df(dfs_orig_d, dfs_cut_d, cuts1),
                1, rapidity_range_clean, pT_range, output_path, ' deploy')
+
+
+
+pT_vs_rapidity(dfb_orig_train, dfb_cut_train, difference_df(dfb_orig_train, dfb_cut_train, cuts1),
+               0, rapidity_range_clean, pT_range, output_path, ' train')
+
+pT_vs_rapidity(dfb_orig_test, dfb_cut_test, difference_df(dfb_orig_test, dfb_cut_test, cuts1),
+               0, rapidity_range_clean, pT_range, output_path, ' test')
+
+pT_vs_rapidity(dfb_orig_d, dfb_cut_d, difference_df(dfb_orig_d, dfb_cut_d, cuts1),
+               0, rapidity_range_clean, pT_range, output_path, ' deploy')
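The tree plot added in PATCH 39 goes through xgboost's matplotlib helper; note that xgb.plot_tree additionally requires the graphviz package to be installed. A compact sketch on a toy model (the model, data, and file name below are made up, not the repo's trained booster):

import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt

# Toy binary-classification model for illustration only
X = np.random.rand(200, 4)
y = (X[:, 0] + X[:, 1] > 1).astype(int)
model = xgb.XGBClassifier(n_estimators=5, max_depth=3).fit(X, y)

fig, ax = plt.subplots(figsize=(30, 20))
xgb.plot_tree(model, num_trees=0, ax=ax)   # draw the first boosted tree
fig.savefig('tree_sketch.png', dpi=150)

Drawing onto an explicitly created, generously sized axes (rather than resizing plt.gcf() after the fact, as the patch does) keeps node labels legible without producing a 150x100-inch canvas.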