diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a5f0f04 --- /dev/null +++ b/.gitignore @@ -0,0 +1,144 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PNG images +*.png + +# PDF files +*.pdf diff --git a/CBM_ML_XGB_package.py b/CBM_ML_XGB_package.py deleted file mode 100644 index bb56c20..0000000 --- a/CBM_ML_XGB_package.py +++ /dev/null @@ -1,213 +0,0 @@ -#We import some libraries -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt -import xgboost as xgb -from sklearn.model_selection import train_test_split -from sklearn.metrics import confusion_matrix -from sklearn.model_selection import RandomizedSearchCV, cross_val_score -from scipy.stats import uniform -import weakref -from bayes_opt import BayesianOptimization -from data_cleaning import clean_df -from KFPF_lambda_cuts import KFPF_lambda_cuts -from plot_tools import AMS, preds_prob, plot_confusion_matrix -import gc -from tree_importer import tree_importer - -#To save some memory we will delete unused variables -class TestClass(object): - def check(self): - print ("object is alive!") - def __del__(self): - print ("object deleted") - -#to paralellize some part of the code -from concurrent.futures import ThreadPoolExecutor -executor = ThreadPoolExecutor(8) - - -# We import three root files into our jupyter notebook -signal= tree_importer('/home/shahid/cbmsoft/Data/PFSimplePlainTreeSignal.root','PlainTree') - -# We only select lambda candidates -sgnal = signal[(signal['LambdaCandidates_is_signal']==1) & (signal['LambdaCandidates_mass']>1.108) - & (signal['LambdaCandidates_mass']<1.1227)] - -# Similarly for the background -background = tree_importer('/home/shahid/cbmsoft/Data/PFSimplePlainTreeBackground.root','PlainTree') -bg = background[(background['LambdaCandidates_is_signal'] == 0) - & ((background['LambdaCandidates_mass'] > 1.07) - & (background['LambdaCandidates_mass'] < 1.108) | (background['LambdaCandidates_mass']>1.1227) - & (background['LambdaCandidates_mass'] < 2.00))] - -#delete unused variables -del signal -del background - -#we also import a 10k events data set, generated using URQMD with AuAu collisions at 12AGeV -file = tree_importer('/home/shahid/cbmsoft/Data/10k_events_PFSimplePlainTree.root','PlainTree') -#Call the python garbage collector to clean up things -gc.collect() -df_original= pd.DataFrame(data=file) -del file - -#The labels of the columns in the df data frame are having the prefix LambdaCandidates_ so we rename them -new_labels= ['chi2geo', 'chi2primneg', 'chi2primpos', 'chi2topo', 'cosineneg', - 'cosinepos', 'cosinetopo', 'distance', 'eta', 'l', 'ldl', - 'mass', 'p', 'pT', 'phi', 'px', 'py', 'pz', 'rapidity', - 'x', 'y', 'z', 'daughter1id', 'daughter2id', 'isfrompv', 'pid', 'issignal'] - -sgnal.columns = new_labels -bg.columns = new_labels -df_original.columns=new_labels - -# Next we clean the data using the clean_df function saved in another .py file - -#Creating a new data frame and saving the results in it after cleaning of the original dfs -#Also keeping the original one -bcknd = clean_df(bg) -signal = clean_df(sgnal) - -del bg -del sgnal -gc.collect() - -df_clean = clean_df(df_original) -del df_original -gc.collect() - -# We 
randomly choose our signal set of 4000 candidates -signal_selected= signal.sample(n=90000) - -#background = 3 times the signal is also done randomly -background_selected = bcknd - -del signal -del bcknd - -#Let's combine signal and background -dfs = [signal_selected, background_selected] -df_scaled = pd.concat(dfs) -# Let's shuffle the rows randomly -df_scaled = df_scaled.sample(frac=1) -del dfs - - -# The following columns will be used to predict whether a reconstructed candidate is a lambda particle or not -cuts = [ 'chi2primneg', 'chi2primpos', 'ldl', 'distance', 'chi2geo'] -x = df_scaled[cuts].copy() - -# The MC information is saved in this y variable -y =pd.DataFrame(df_scaled['issignal'], dtype='int') - -#We do the same for the 10k events data set -x_whole = df_clean[cuts].copy() -y_whole = pd.DataFrame(df_clean['issignal'], dtype='int') - - -#Creating a train and test set from the signal and background combined data set -x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=324) - -dtrain = xgb.DMatrix(x_train, label = y_train) -dtest = xgb.DMatrix(x_whole, label = y_whole) -dtest1=xgb.DMatrix(x_test, label = y_test) - -#Bayesian Optimization function for xgboost -#specify the parameters you want to tune as keyword arguments -def bo_tune_xgb(max_depth, gamma, alpha, n_estimators ,learning_rate): - params = {'max_depth': int(max_depth), - 'gamma': gamma, - 'alpha':alpha, - 'n_estimators': n_estimators, - 'learning_rate':learning_rate, - 'subsample': 0.8, - 'eta': 0.1, - 'eval_metric': 'auc', 'nthread' : 7} - cv_result = xgb.cv(params, dtrain, num_boost_round=70, nfold=5) - return cv_result['test-auc-mean'].iloc[-1] - - -#Invoking the Bayesian Optimizer with the specified parameters to tune -xgb_bo = BayesianOptimization(bo_tune_xgb, {'max_depth': (4, 10), - 'gamma': (0, 1), - 'alpha': (2,20), - 'learning_rate':(0,1), - 'n_estimators':(100,500) - }) - -#performing Bayesian optimization for 5 iterations with 8 steps of random exploration with an #acquisition function of expected improvement -xgb_bo.maximize(n_iter=5, init_points=8, acq='ei') - -max_param = xgb_bo.max['params'] -param= {'alpha': max_param['alpha'], 'gamma': max_param['gamma'], 'learning_rate': max_param['learning_rate'], 'max_depth': int(round(max_param['max_depth'],0)), 'n_estimators': int(round(max_param['n_estimators'],0)), 'objective': 'binary:logistic'} - -bst = xgb.train(param, dtrain) -bst1= bst.predict(dtrain) - -bst_test = pd.DataFrame(data=bst.predict(dtest1), columns=["xgb_preds"]) -y_test=y_test.set_index(np.arange(0,bst_test.shape[0])) -bst_test['issignal']=y_test['issignal'] - -#To visualize the predictions of the classifier in terms of probabilities -#The first argument should be a data frame, the second a column in it, in the form 'preds' -preds_prob(bst_test,'xgb_preds', 'issignal') - -#To choose the best threshold -train_best, test_best = AMS(y_train, bst1,y_test, bst_test['xgb_preds']) - -#Applying XGB on the 10k events data-set -df_clean['xgb_preds'] = bst.predict(dtest) - -#plot confusion matrix -cut1 = test_best -df_clean['xgb_preds1'] = ((df_clean['xgb_preds']>cut1)*1) -cnf_matrix = confusion_matrix(y_whole, df_clean['xgb_preds1'], labels=[1,0]) -#cnf_matrix = confusion_matrix(new_check_set['issignal'], new_check_set['new_signal'], labels=[1,0]) -np.set_printoptions(precision=2) -fig, axs = plt.subplots(figsize=(10, 8)) -axs.yaxis.set_label_coords(-0.04,.5) -axs.xaxis.set_label_coords(0.5,-.005) -plot_confusion_matrix(cnf_matrix, classes=['signal','background'], 
title='Confusion Matrix for XGB for cut > '+str(cut1))
-
-
-#comparison with manual cuts
-#returns a new df
-new_check_set=KFPF_lambda_cuts(df_clean)
-
-
-cut3 = test_best
-mask1 = df_clean['xgb_preds']>cut3
-df3_base=df_clean[mask1]
-from matplotlib import gridspec
-range1= (1.0999, 1.17)
-fig, axs = plt.subplots(2, 1,figsize=(15,10), sharex=True, gridspec_kw={'width_ratios': [10],
-                           'height_ratios': [8,4]})
-
-ns, bins, patches=axs[0].hist((df3_base['mass']),bins = 300, range=range1, facecolor='red',alpha = 0.3)
-ns1, bins1, patches1=axs[0].hist((new_check_set['mass']),bins = 300, range=range1,facecolor='blue',alpha = 0.3)
-
-axs[0].set_ylabel("counts", fontsize = 15)
-axs[0].legend(('XGBoost Selected $\Lambda$s','KFPF selected $\Lambda$s'), fontsize = 15, loc='upper right')
-
-#plt.rcParams["legend.loc"] = 'upper right'
-axs[0].set_title("The lambda's Invariant Mass histogram with KFPF and XGB selection criteria on KFPF variables", fontsize = 15)
-axs[0].grid()
-axs[0].tick_params(axis='both', which='major', labelsize=15)
-
-
-hist1, bin_edges1 = np.histogram(df3_base['mass'],range=(1.09, 1.17), bins=300)
-hist2, bin_edges2 = np.histogram(new_check_set['mass'],range=(1.09, 1.17), bins=300)
-
-#makes sense to have only positive values
-diff = (hist1 - hist2)
-axs[1].bar(bins[:-1], # this is what makes it comparable
-           ns / ns1, # maybe check for div-by-zero!
-           width=0.001)
-plt.xlabel("Mass in $\dfrac{GeV}{c^2}$", fontsize = 15)
-axs[1].set_ylabel("XGB / KFPF", fontsize = 15)
-axs[1].grid()
-axs[1].tick_params(axis='both', which='major', labelsize=15)
-
-plt.show()
-fig.tight_layout()
diff --git a/README.md b/README.md
index 067fbe7..52908cb 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# XGB
+# XGB
This repository contains the basic code of the machine learning package XGBoost applied to lambda reconstruction at the CBM experiment at GSI/FAIR.
The code is split into different files so that it is more readable. The library directory contains various files that could be installed locally. For further information visit that directory.
@@ -6,3 +6,6 @@
We have created a Google Colab for this code; it has detailed information on what has been done in this code and what the motivation behind it was.
The colab is available on the following link https://colab.research.google.com/drive/1Up8YvcWiYv0NN7nOA5YhlpsiqTsY0eCt?usp=sharing +# Usage + +python clusterXGB.py /path/to_signal.root /path/to_background.root output_path diff --git a/The CBM ML-XGB package.ipynb b/The CBM ML-XGB package.ipynb deleted file mode 100644 index 92302d9..0000000 --- a/The CBM ML-XGB package.ipynb +++ /dev/null @@ -1,246 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#We import some libraries\n", - "import numpy as np\n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "import xgboost as xgb\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.metrics import confusion_matrix\n", - "from sklearn.model_selection import RandomizedSearchCV, cross_val_score\n", - "from scipy.stats import uniform\n", - "import weakref \n", - "from bayes_opt import BayesianOptimization\n", - "from data_cleaning import clean_df\n", - "from KFPF_lambda_cuts import KFPF_lambda_cuts\n", - "from plot_tools import AMS, preds_prob, plot_confusion_matrix\n", - "import gc\n", - "from tree_importer import tree_importer\n", - "\n", - "#To save some memory we will delete unused variables\n", - "class TestClass(object): \n", - " def check(self): \n", - " print (\"object is alive!\") \n", - " def __del__(self): \n", - " print (\"object deleted\") \n", - " \n", - "#to paralellize some part of the code\n", - "from concurrent.futures import ThreadPoolExecutor\n", - "executor = ThreadPoolExecutor(8)\n", - "\n", - "\n", - "# We import three root files into our jupyter notebook\n", - "signal= tree_importer('/home/shahid/cbmsoft/Data/PFSimplePlainTreeSignal.root','PlainTree')\n", - "\n", - "# We only select lambda candidates\n", - "sgnal = signal[(signal['LambdaCandidates_is_signal']==1) & (signal['LambdaCandidates_mass']>1.108)\n", - " & (signal['LambdaCandidates_mass']<1.1227)]\n", - "\n", - "# Similarly for the background\n", - "background = tree_importer('/home/shahid/cbmsoft/Data/PFSimplePlainTreeBackground.root','PlainTree')\n", - "bg = background[(background['LambdaCandidates_is_signal'] == 0)\n", - " & ((background['LambdaCandidates_mass'] > 1.07)\n", - " & (background['LambdaCandidates_mass'] < 1.108) | (background['LambdaCandidates_mass']>1.1227) \n", - " & (background['LambdaCandidates_mass'] < 2.00))]\n", - "\n", - "#delete unused variables\n", - "del signal\n", - "del background\n", - "\n", - "#we also import a 10k events data set, generated using URQMD with AuAu collisions at 12AGeV\n", - "file = tree_importer('/home/shahid/cbmsoft/Data/10k_events_PFSimplePlainTree.root','PlainTree')\n", - "#Call the python garbage collector to clean up things\n", - "gc.collect()\n", - "df_original= pd.DataFrame(data=file)\n", - "del file\n", - "\n", - "#The labels of the columns in the df data frame are having the prefix LambdaCandidates_ so we rename them\n", - "new_labels= ['chi2geo', 'chi2primneg', 'chi2primpos', 'chi2topo', 'cosineneg',\n", - " 'cosinepos', 'cosinetopo', 'distance', 'eta', 'l', 'ldl',\n", - " 'mass', 'p', 'pT', 'phi', 'px', 'py', 'pz', 'rapidity',\n", - " 'x', 'y', 'z', 'daughter1id', 'daughter2id', 'isfrompv', 'pid', 'issignal']\n", - "\n", - "sgnal.columns = new_labels\n", - "bg.columns = new_labels\n", - "df_original.columns=new_labels\n", - "\n", - "# Next we clean the data using the clean_df function saved in another .py file\n", - "\n", - "#Creating a new data frame and saving the results in it after 
cleaning of the original dfs\n", - "#Also keeping the original one\n", - "bcknd = clean_df(bg)\n", - "signal = clean_df(sgnal)\n", - "\n", - "del bg\n", - "del sgnal\n", - "gc.collect()\n", - "\n", - "df_clean = clean_df(df_original)\n", - "del df_original\n", - "gc.collect()\n", - "\n", - "# We randomly choose our signal set of 4000 candidates\n", - "signal_selected= signal.sample(n=90000)\n", - "\n", - "#background = 3 times the signal is also done randomly\n", - "background_selected = bcknd\n", - "\n", - "del signal\n", - "del bcknd\n", - "\n", - "#Let's combine signal and background\n", - "dfs = [signal_selected, background_selected]\n", - "df_scaled = pd.concat(dfs)\n", - "# Let's shuffle the rows randomly\n", - "df_scaled = df_scaled.sample(frac=1)\n", - "del dfs\n", - "\n", - "\n", - "# The following columns will be used to predict whether a reconstructed candidate is a lambda particle or not\n", - "cuts = [ 'chi2primneg', 'chi2primpos', 'ldl', 'distance', 'chi2geo']\n", - "x = df_scaled[cuts].copy()\n", - "\n", - "# The MC information is saved in this y variable\n", - "y =pd.DataFrame(df_scaled['issignal'], dtype='int')\n", - "\n", - "#We do the same for the 10k events data set\n", - "x_whole = df_clean[cuts].copy()\n", - "y_whole = pd.DataFrame(df_clean['issignal'], dtype='int')\n", - "\n", - "\n", - "#Creating a train and test set from the signal and background combined data set\n", - "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=324)\n", - "\n", - "dtrain = xgb.DMatrix(x_train, label = y_train)\n", - "dtest = xgb.DMatrix(x_whole, label = y_whole)\n", - "dtest1=xgb.DMatrix(x_test, label = y_test)\n", - "\n", - "#Bayesian Optimization function for xgboost\n", - "#specify the parameters you want to tune as keyword arguments\n", - "def bo_tune_xgb(max_depth, gamma, alpha, n_estimators ,learning_rate):\n", - " params = {'max_depth': int(max_depth),\n", - " 'gamma': gamma,\n", - " 'alpha':alpha,\n", - " 'n_estimators': n_estimators,\n", - " 'learning_rate':learning_rate,\n", - " 'subsample': 0.8,\n", - " 'eta': 0.1,\n", - " 'eval_metric': 'auc', 'nthread' : 7}\n", - " cv_result = xgb.cv(params, dtrain, num_boost_round=70, nfold=5)\n", - " return cv_result['test-auc-mean'].iloc[-1]\n", - "\n", - "\n", - "#Invoking the Bayesian Optimizer with the specified parameters to tune\n", - "xgb_bo = BayesianOptimization(bo_tune_xgb, {'max_depth': (4, 10),\n", - " 'gamma': (0, 1),\n", - " 'alpha': (2,20),\n", - " 'learning_rate':(0,1),\n", - " 'n_estimators':(100,500)\n", - " })\n", - "\n", - "#performing Bayesian optimization for 5 iterations with 8 steps of random exploration with an #acquisition function of expected improvement\n", - "xgb_bo.maximize(n_iter=5, init_points=8, acq='ei')\n", - "\n", - "max_param = xgb_bo.max['params']\n", - "param= {'alpha': max_param['alpha'], 'gamma': max_param['gamma'], 'learning_rate': max_param['learning_rate'], 'max_depth': int(round(max_param['max_depth'],0)), 'n_estimators': int(round(max_param['n_estimators'],0)), 'objective': 'binary:logistic'}\n", - "\n", - "bst = xgb.train(param, dtrain)\n", - "bst1= bst.predict(dtrain)\n", - "\n", - "bst_test = pd.DataFrame(data=bst.predict(dtest1), columns=[\"xgb_preds\"])\n", - "y_test=y_test.set_index(np.arange(0,bst_test.shape[0]))\n", - "bst_test['issignal']=y_test['issignal']\n", - "\n", - "#To visualize the predictions of the classifier in terms of probabilities\n", - "#The first argument should be a data frame, the second a column in it, in the form 'preds'\n", - 
"preds_prob(bst_test,'xgb_preds', 'issignal')\n", - "\n", - "#To choose the best threshold\n", - "train_best, test_best = AMS(y_train, bst1,y_test, bst_test['xgb_preds'])\n", - "\n", - "#Applying XGB on the 10k events data-set\n", - "df_clean['xgb_preds'] = bst.predict(dtest)\n", - "\n", - "#plot confusion matrix\n", - "cut1 = test_best\n", - "df_clean['xgb_preds1'] = ((df_clean['xgb_preds']>cut1)*1)\n", - "cnf_matrix = confusion_matrix(y_whole, df_clean['xgb_preds1'], labels=[1,0])\n", - "#cnf_matrix = confusion_matrix(new_check_set['issignal'], new_check_set['new_signal'], labels=[1,0])\n", - "np.set_printoptions(precision=2)\n", - "fig, axs = plt.subplots(figsize=(10, 8))\n", - "axs.yaxis.set_label_coords(-0.04,.5)\n", - "axs.xaxis.set_label_coords(0.5,-.005)\n", - "plot_confusion_matrix(cnf_matrix, classes=['signal','background'], title='Confusion Matrix for XGB for cut > '+str(cut1))\n", - "\n", - "\n", - "#comparison with manual cuts\n", - "#returns a new df \n", - "new_check_set=KFPF_lambda_cuts(df_clean)\n", - "\n", - "\n", - "cut3 = test_best\n", - "mask1 = df_clean['xgb_preds']>cut3\n", - "df3_base=df_clean[mask1]\n", - "from matplotlib import gridspec\n", - "range1= (1.0999, 1.17)\n", - "fig, axs = plt.subplots(2, 1,figsize=(15,10), sharex=True, gridspec_kw={'width_ratios': [10],\n", - " 'height_ratios': [8,4]})\n", - "\n", - "ns, bins, patches=axs[0].hist((df3_base['mass']),bins = 300, range=range1, facecolor='red',alpha = 0.3)\n", - "ns1, bins1, patches1=axs[0].hist((new_check_set['mass']),bins = 300, range=range1,facecolor='blue',alpha = 0.3)\n", - "\n", - "axs[0].set_ylabel(\"counts\", fontsize = 15)\n", - "axs[0].legend(('XGBoost Selected $\\Lambda$s','KFPF selected $\\Lambda$s'), fontsize = 15, loc='upper right')\n", - "\n", - "#plt.rcParams[\"legend.loc\"] = 'upper right'\n", - "axs[0].set_title(\"The lambda's Invariant Mass histogram with KFPF and XGB selection criteria on KFPF variables\", fontsize = 15)\n", - "axs[0].grid()\n", - "axs[0].tick_params(axis='both', which='major', labelsize=15)\n", - "\n", - "\n", - "hist1, bin_edges1 = np.histogram(df3_base['mass'],range=(1.09, 1.17), bins=300)\n", - "hist2, bin_edges2 = np.histogram(new_check_set['mass'],range=(1.09, 1.17), bins=300)\n", - "\n", - "#makes sense to have only positive values \n", - "diff = (hist1 - hist2)\n", - "axs[1].bar(bins[:-1], # this is what makes it comparable\n", - " ns / ns1, # maybe check for div-by-zero!\n", - " width=0.001)\n", - "plt.xlabel(\"Mass in $\\dfrac{GeV}{c^2}$\", fontsize = 15)\n", - "axs[1].set_ylabel(\"XGB / KFPF\", fontsize = 15)\n", - "axs[1].grid()\n", - "axs[1].tick_params(axis='both', which='major', labelsize=15)\n", - "\n", - "plt.show()\n", - "fig.tight_layout()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.6" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/clusterXGB.py b/clusterXGB.py new file mode 100644 index 0000000..41ee6f7 --- /dev/null +++ b/clusterXGB.py @@ -0,0 +1,488 @@ +import numpy as np +import pandas as pd +import xgboost as xgb +import matplotlib.pyplot as plt + +from library.CBM_ML.tree_importer import tree_importer +from library.CBM_ML.plot_tools import AMS, preds_prob,plot_confusion_matrix, cut_visualization + +from 
sklearn.model_selection import train_test_split
+
+from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score
+
+from sklearn.tree import plot_tree
+from sklearn.tree import export_graphviz
+
+from sklearn.model_selection import cross_val_score
+from scipy.stats import uniform
+
+from numpy import sqrt, log, argmax
+
+from bayes_opt import BayesianOptimization
+
+import gc
+
+import sys
+import os
+
+from distributions.var_distr import hist_variables
+from distributions.var1Dcorr import vars, calculate_correlation, plot1Dcorrelation
+from distributions.pt_rapidity import pT_vs_rapidity
+from distributions.twoD_mass_pT import plot2D
+from matplotlib.backends.backend_pdf import PdfPages
+
+
+tree_name = 'PlainTree'
+
+path_list = []
+
+# How many threads we use to parallelize the code
+number_of_threads = 3
+
+for x in sys.argv[1:]:
+    path_list.append(x)
+
+signal_path = path_list[0]
+df_urqmd_path = path_list[1]
+
+output_path = path_list[2]
+
+
+if not os.path.exists(output_path):
+    os.mkdir(output_path)
+
+def data_selection(signal_path, bgr_path, tree, threads):
+
+    """
+    We have selected signal candidates only from the DCM model; therefore, we
+    treat it as simulated data. The URQMD data set will be treated as real
+    experimental data.
+
+    Our URQMD 100k events data, which looks more like what we will get from the
+    final experiment, has a lot more background than signal. This unequal ratio
+    of classes (signal and background) in our data set (URQMD: about 99.99%
+    background and well below 1% signal) is called an imbalanced classification
+    problem.
+
+    One way to tackle this problem is to resample the data. Deleting instances
+    from the over-represented class (in our case the background) is a resampling
+    method known as under-sampling.
+
+    So for training and testing we will take signal candidates from the DCM
+    signal and background from URQMD (3 times the signal size).
+
+    Parameters
+    ----------
+    signal_path: str
+        path to signal file
+    bgr_path: str
+        path to background file
+    tree: str
+        name of flat tree
+    threads: int
+        number of parallel threads to use
+    """
+    signal = tree_importer(signal_path, tree, threads)
+    df_urqmd = tree_importer(bgr_path, tree, threads)
+
+
+    signal_selected = signal
+
+    background_selected = df_urqmd[(df_urqmd['issignal'] == 0)
+                   & ((df_urqmd['mass'] > 1.07)
+                   & (df_urqmd['mass'] < 1.108) | (df_urqmd['mass']>1.1227)
+                   & (df_urqmd['mass'] < 1.3))].sample(n=3*(signal.shape[0]))
+
+    print("Signal: ", len(signal_selected))
+    print("background: ", len(background_selected))
+
+
+    df_scaled = pd.concat([signal_selected, background_selected])
+    df_scaled = df_scaled.sample(frac=1)
+    del signal, background_selected
+
+    return df_scaled
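+# NOTE (illustration only, not used by the pipeline): a minimal sketch of the
+# under-sampling idea described in the docstring above, for a hypothetical
+# dataframe `df` with an 'issignal' column. The background is randomly reduced
+# to three times the signal size and the result is shuffled:
+#
+#     sig = df[df['issignal'] == 1]
+#     bgr = df[df['issignal'] == 0].sample(n=3 * sig.shape[0], random_state=0)
+#     balanced = pd.concat([sig, bgr]).sample(frac=1, random_state=0)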
+
+
+df_scaled = data_selection(signal_path, df_urqmd_path, tree_name,
+                           number_of_threads)
+
+print("Df scaled: ", len(df_scaled))
+
+
+
+# features to be trained
+cuts = [ 'chi2primneg', 'chi2primpos','chi2geo','distance', 'ldl']
+
+
+def train_test_set(df_scaled, cuts):
+    """
+    To make machine learning algorithms perform well on unseen data, we divide
+    our data into two sets: one for training the algorithm and one for testing
+    it. Without this split the algorithm can overfit and we would not capture
+    the general trends in the data.
+
+    Parameters
+    ----------
+    df_scaled: dataframe
+        dataframe with mixed signal and background
+    cuts: list(contains strings)
+        features on which training is based
+    """
+    # x = df_scaled[cuts].copy()
+    x = df_scaled.copy()
+
+    # The MC information is saved in this y variable
+    y = pd.DataFrame(df_scaled['issignal'], dtype='int')
+    x_train_all, x_test_all, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=324)
+
+
+    x_train = x_train_all[cuts].copy()
+    x_test = x_test_all[cuts].copy()
+    # DMatrix is an internal data structure used by XGBoost,
+    # optimized for both memory efficiency and training speed.
+    dtrain = xgb.DMatrix(x_train, label = y_train)
+    dtest1 = xgb.DMatrix(x_test, label = y_test)
+
+    return dtrain, dtest1,x_train_all,x_test_all,x_train, x_test, y_train, y_test
+
+
+dtrain, dtest1,x_train_all, x_test_all, x_train,x_test, y_train, y_test = train_test_set(df_scaled, cuts)
+
+
+del df_scaled
+gc.collect()
+
+# open the deploy dataframe, with x and y as well
+def open_deploy_data(deploy_path, tree, threads):
+    deploy_data = tree_importer(deploy_path, tree, threads)
+
+    return deploy_data
+
+
+def deploy_set(deploy_data, cuts):
+    x_deploy = deploy_data[cuts].copy()
+    y_deploy = pd.DataFrame(deploy_data['issignal'], dtype='int')
+
+    ddeploy = xgb.DMatrix(x_deploy, label = y_deploy)
+
+    return ddeploy, x_deploy, y_deploy
+
+
+deploy_data = open_deploy_data(df_urqmd_path, tree_name, number_of_threads)
+
+ddeploy, x_deploy, y_deploy = deploy_set(deploy_data, cuts)
+
+
+# Bayesian optimization function for XGBoost:
+# specify the parameters you want to tune as keyword arguments
+def bo_tune_xgb(max_depth, gamma, alpha, n_estimators, learning_rate):
+    # note: n_estimators is an sklearn-API parameter name; xgb.cv sets the
+    # number of boosting rounds via num_boost_round
+    params = {'max_depth': int(max_depth),
+              'gamma': gamma,
+              'alpha':alpha,
+              'n_estimators': n_estimators,
+              'learning_rate':learning_rate,
+              'subsample': 0.8,
+              'eta': 0.3,
+              'eval_metric': 'auc', 'nthread' : 7}
+    cv_result = xgb.cv(params, dtrain, num_boost_round=10, nfold=5)
+    return cv_result['test-auc-mean'].iloc[-1]
+
+
+
+def get_best_params():
+    """
+    Performs Bayesian optimization to find the best hyperparameters, then
+    trains an XGBoost model with them and returns the trained booster.
+
+    Parameters:
+    None
+    """
+    # Invoking the Bayesian optimizer with the specified parameters to tune
+    xgb_bo = BayesianOptimization(bo_tune_xgb, {'max_depth': (4, 10),
+                                                'gamma': (0, 1),
+                                                'alpha': (2,20),
+                                                'learning_rate':(0,1),
+                                                'n_estimators':(100,500)
+                                                })
+    # Performing Bayesian optimization: init_points steps of random exploration
+    # followed by n_iter optimization steps (both set to 1 here to keep the run short)
+    xgb_bo.maximize(n_iter=1, init_points=1)
+
+    max_param = xgb_bo.max['params']
+    param = {'alpha': max_param['alpha'], 'gamma': max_param['gamma'], 'learning_rate': max_param['learning_rate'],
+             'max_depth': int(round(max_param['max_depth'],0)), 'n_estimators': int(round(max_param['n_estimators'],0)),
+             'objective': 'reg:logistic'}
+    gc.collect()
+
+
+    # Train the algorithm on the training data using the parameters
+    # selected by Bayesian optimization
+    bst = xgb.train(param, dtrain)
+    return bst
+
+
+bst = get_best_params()
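+# NOTE (illustration only, hypothetical file name): the trained booster could be
+# persisted with XGBoost's native serialization and reloaded later, e.g.:
+#
+#     bst.save_model(output_path + '/xgb_model.json')
+#     bst2 = xgb.Booster()
+#     bst2.load_model(output_path + '/xgb_model.json')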
+
+# predictions on the training set
+bst_train = pd.DataFrame(data=bst.predict(dtrain, output_margin=False), columns=["xgb_preds"])
+y_train = y_train.set_index(np.arange(0,bst_train.shape[0]))
+bst_train['issignal'] = y_train['issignal']
+
+
+# predictions on the test set
+bst_test = pd.DataFrame(data=bst.predict(dtest1, output_margin=False), columns=["xgb_preds"])
+y_test = y_test.set_index(np.arange(0,bst_test.shape[0]))
+bst_test['issignal'] = y_test['issignal']
+
+
+bst_deploy = pd.DataFrame(data=bst.predict(ddeploy, output_margin=False), columns=["xgb_preds"])
+y_deploy = y_deploy.set_index(np.arange(0,bst_deploy.shape[0]))
+bst_deploy['issignal'] = y_deploy['issignal']
+
+
+
+# The following graph will show us which features are important for the model
+ax = xgb.plot_importance(bst)
+plt.rcParams['figure.figsize'] = [6, 3]
+plt.show()
+ax.figure.tight_layout()
+ax.figure.savefig(str(output_path)+"/hits.png")
+
+
+# ROC curves for the predictions on the train and test sets
+train_best, test_best = AMS(y_train, bst_train['xgb_preds'],y_test, bst_test['xgb_preds'], output_path)
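+# NOTE (illustration only): AMS scans the ROC curve for the threshold that
+# maximizes the approximate median significance S0; a hand-rolled version of the
+# same scan would look like this (roc_curve, sqrt, log and argmax are already
+# imported above; fpr = 0 bins would need guarding against division by zero):
+#
+#     fpr, tpr, thresholds = roc_curve(y_test, bst_test['xgb_preds'])
+#     s0 = sqrt(2 * ((tpr + fpr) * log(1 + tpr / fpr) - tpr))
+#     best_threshold = thresholds[argmax(s0)]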
+
+
+# The first argument should be a data frame, the second a column in it, in the form 'preds'
+preds_prob(bst_test,'xgb_preds', 'issignal','test', output_path)
+
+xgb.plot_tree(bst, num_trees=0)
+fig = plt.gcf()
+fig.set_size_inches(150, 100)
+fig.savefig(output_path+'/tree.png')
+
+
+def CM_plot(best, x, output_path):
+    """
+    Plots the confusion matrix. A confusion matrix C is such that Cij is equal to
+    the number of observations known to be in group i and predicted to be in
+    group j. Thus in binary classification (with labels=[1,0]), the count of true
+    positives is C00, false negatives C01, false positives C10, and true
+    negatives C11.
+
+    The confusion matrix is applied to data previously unseen by the model, so we
+    can estimate the model's performance.
+
+    Parameters
+    ----------
+    best: numpy.float32
+        best threshold
+
+    x: dataframe
+        dataframe with predictions and true labels on which we want to get the
+        confusion matrix
+    """
+    # let's take the best threshold and look at the confusion matrix
+    cut1 = best
+    x['xgb_preds1'] = ((x['xgb_preds']>cut1)*1)
+    cnf_matrix = confusion_matrix(x['issignal'], x['xgb_preds1'], labels=[1,0])
+    np.set_printoptions(precision=2)
+    fig, axs = plt.subplots(figsize=(10, 8))
+    axs.yaxis.set_label_coords(-0.04,.5)
+    axs.xaxis.set_label_coords(0.5,-.005)
+    plot_confusion_matrix(cnf_matrix, classes=['signal','background'],
+                          title='Confusion Matrix for XGB for cut > '+str(cut1))
+    plt.savefig(str(output_path)+'/confusion_matrix_extreme_gradient_boosting_whole_data.png')
+
+
+CM_plot(test_best, bst_test, output_path)
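+# NOTE (illustration only): with labels=[1,0], sklearn orders the matrix as
+# [[TP, FN], [FP, TN]]; for example
+#
+#     confusion_matrix([1, 1, 0, 0], [1, 0, 0, 0], labels=[1, 0])
+#
+# returns [[1, 1], [0, 2]]: one true positive, one false negative, no false
+# positives and two true negatives.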
+
+def original_SB(df):
+    dfs_orig = df[df['issignal']==1]
+    dfb_orig = df[df['issignal']==0]
+
+    return dfs_orig, dfb_orig
+
+
+dfs_orig_train, dfb_orig_train = original_SB(x_train_all)
+dfs_orig_test, dfb_orig_test = original_SB(x_test_all)
+dfs_orig_d, dfb_orig_d = original_SB(deploy_data)
+
+dfs_orig = deploy_data[deploy_data['issignal']==1]
+dfb_orig = deploy_data[deploy_data['issignal']==0]
+
+vars_to_draw_corr = vars(dfs_orig, 'mass')
+
+corr_signal, corr_signal_errors = calculate_correlation(dfs_orig, vars_to_draw_corr, 'mass')
+corr_bg, corr_bg_errors = calculate_correlation(dfb_orig, vars_to_draw_corr, 'mass')
+
+plot1Dcorrelation(vars_to_draw_corr,'mass', corr_signal, corr_signal_errors,
+                  corr_bg, corr_bg_errors, output_path)
+
+
+
+
+pdf_2d_mass_feat_sign = PdfPages(output_path+'/'+'2d_mass_feat_sign_train.pdf')
+pdf_2d_mass_feat_bgr = PdfPages(output_path+'/'+'2d_mass_feat_bgr_train.pdf')
+
+
+
+def dist2D(df, vars, x_range, sign, pdf):
+    """
+    Plots 2-D (mass vs. feature) histograms for the given features into a pdf,
+    log-transforming the chi2-like and distance variables first.
+    """
+    df_new = df.copy()
+
+    non_log_x = ['cosineneg', 'cosinepos', 'cosinetopo', 'mass', 'pT', 'rapidity',
+                 'phi', 'eta', 'x', 'y','z', 'px', 'py', 'pz', 'l', 'ldl']
+
+    log_x = ['chi2geo', 'chi2primneg', 'chi2primpos', 'chi2topo', 'distance']
+
+    new_log_x = []
+
+    for var in vars:
+        if var in log_x:
+            df_new[var+'_log'] = np.log(df_new[var])
+            df_new = df_new.drop([var], axis=1)
+            new_log_x.append(var+'_log')
+
+        if var in non_log_x:
+            new_log_x.append(var)
+
+
+    for feat in new_log_x:
+
+        plot2D(df_new, ' train', sign, 'mass', feat, x_range,[df_new[feat].min(),
+               df_new[feat].max()], pdf)
+
+
+    pdf.close()
+
+
+
+
+dist2D(dfs_orig_train, ['chi2geo', 'chi2primneg', 'chi2primpos', 'distance', 'ldl', 'rapidity',
+       'pT'], [1.108, 1.1227], 1, pdf_2d_mass_feat_sign)
+
+
+dist2D(dfb_orig_train, ['chi2geo', 'chi2primneg', 'chi2primpos', 'distance', 'ldl', 'rapidity',
+       'pT'], [1, 1.5], 0, pdf_2d_mass_feat_bgr)
+
+
+
+def df_distribution(df, bst_df, df_best):
+    df['issignalXGB'] = bst_df['xgb_preds'].values
+    df['xgb_preds1'] = ((df['issignalXGB']>df_best)*1)
+
+
+    dfs_cut = df[(df['xgb_preds1']==1) & (df['issignal']==1)]
+    dfb_cut = df[(df['xgb_preds1']==1) & (df['issignal']==0)]
+
+
+    return dfs_cut, dfb_cut
+
+
+
+dfs_cut_train, dfb_cut_train = df_distribution(x_train_all,
+                                               bst_train, train_best)
+
+
+dfs_cut_test, dfb_cut_test = df_distribution(x_test_all,
+                                             bst_test, test_best)
+
+
+dfs_cut_d, dfb_cut_d = df_distribution(deploy_data,
+                                       bst_deploy, test_best)
+
+
+non_log_x = ['cosineneg', 'cosinepos', 'cosinetopo', 'mass', 'pT', 'rapidity',
+             'phi', 'eta', 'x', 'y','z', 'px', 'py', 'pz', 'l', 'ldl']
+
+log_x = ['chi2geo', 'chi2primneg', 'chi2primpos', 'chi2topo', 'distance']
+
+new_log_x = []
+
+cuts1 = ['chi2geo', 'chi2primneg', 'chi2primpos', 'distance', 'ldl', 'rapidity',
+         'mass','pT']
+
+
+
+pdf_cuts_train = PdfPages(output_path+'/'+'dist_cuts_urqmd_train.pdf')
+pdf_cuts_test = PdfPages(output_path+'/'+'dist_cuts_urqmd_test.pdf')
+pdf_cuts_deploy = PdfPages(output_path+'/'+'dist_cuts_urqmd_deploy.pdf')
+
+
+cut_visualization(deploy_data,'issignalXGB',test_best, output_path)
+
+def variables_distribution(sample, dfs_orig, dfb_orig, dfs_cut, dfb_cut, cuts_plot, pdf_cuts):
+    # note: the log columns are added to the callers' dataframes in place;
+    # the raw columns are only dropped from this function's local views
+    difference_s = pd.concat([dfs_orig[cuts_plot], dfs_cut[cuts_plot]]).drop_duplicates(keep=False)
+    non_log_x = ['cosineneg', 'cosinepos', 'cosinetopo', 'mass', 'pT', 'rapidity',
+                 'phi', 'eta', 'x', 'y','z', 'px', 'py', 'pz', 'l', 'ldl']
+
+    log_x = ['chi2geo', 'chi2primneg', 'chi2primpos', 'chi2topo', 'distance']
+
+    new_log_x = []
+
+    for cut in cuts_plot:
+        if cut in log_x:
+            dfs_orig[cut+'_log'] = np.log(dfs_orig[cut])
+            dfb_orig[cut+'_log'] = np.log(dfb_orig[cut])
+
+            dfs_cut[cut+'_log'] = np.log(dfs_cut[cut])
+            dfb_cut[cut+'_log'] = np.log(dfb_cut[cut])
+
+            difference_s[cut+'_log'] = np.log(difference_s[cut])
+
+            new_log_x.append(cut+'_log')
+
+
+            dfs_orig = dfs_orig.drop([cut], axis=1)
+            dfb_orig = dfb_orig.drop([cut], axis=1)
+
+            dfs_cut = dfs_cut.drop([cut], axis=1)
+            dfb_cut = dfb_cut.drop([cut], axis=1)
+            difference_s = difference_s.drop([cut], axis=1)
+
+        if cut in non_log_x:
+            new_log_x.append(cut)
+
+
+    for feat in new_log_x:
+        hist_variables(sample, dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s, feat, pdf_cuts)
+
+    pdf_cuts.close()
+
+variables_distribution(' train', dfs_orig_train, dfb_orig_train, dfs_cut_train, dfb_cut_train, cuts1, pdf_cuts_train)
+variables_distribution(' test', dfs_orig_test, dfb_orig_test, dfs_cut_test, dfb_cut_test, cuts1, pdf_cuts_test)
+variables_distribution(' deploy', dfs_orig_d, dfb_orig_d, dfs_cut_d, dfb_cut_d, cuts1, pdf_cuts_deploy)
+
+
+def difference_df(df_orig, df_cut, cut):
+    return pd.concat([df_orig[cut], df_cut[cut]]).drop_duplicates(keep=False)
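+# NOTE (illustration only): pd.concat([...]).drop_duplicates(keep=False), as used
+# above, keeps only rows appearing in exactly one of the two frames, i.e. the
+# candidates rejected by the ML cut (assuming df_cut is a row-subset of df_orig
+# and rows are unique):
+#
+#     a = pd.DataFrame({'v': [1, 2, 3]})
+#     b = pd.DataFrame({'v': [2, 3]})
+#     pd.concat([a, b]).drop_duplicates(keep=False)   # -> the row with v == 1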
+
+
+rapidity_range_unclean = [-1.5, 1.5]
+rapidity_range_clean = [-0.5, 5]
+pT_range = [-0.5, 3]
+
+pT_vs_rapidity(dfs_orig_train, dfs_cut_train, difference_df(dfs_orig_train, dfs_cut_train, cuts1),
+               1, rapidity_range_clean, pT_range, output_path, ' train')
+
+pT_vs_rapidity(dfs_orig_test, dfs_cut_test, difference_df(dfs_orig_test, dfs_cut_test, cuts1),
+               1, rapidity_range_clean, pT_range, output_path, ' test')
+
+pT_vs_rapidity(dfs_orig_d, dfs_cut_d, difference_df(dfs_orig_d, dfs_cut_d, cuts1),
+               1, rapidity_range_clean, pT_range, output_path, ' deploy')
+
+
+
+pT_vs_rapidity(dfb_orig_train, dfb_cut_train, difference_df(dfb_orig_train, dfb_cut_train, cuts1),
+               0, rapidity_range_clean, pT_range, output_path, ' train')
+
+pT_vs_rapidity(dfb_orig_test, dfb_cut_test, difference_df(dfb_orig_test, dfb_cut_test, cuts1),
+               0, rapidity_range_clean, pT_range, output_path, ' test')
+
+pT_vs_rapidity(dfb_orig_d, dfb_cut_d, difference_df(dfb_orig_d, dfb_cut_d, cuts1),
+               0, rapidity_range_clean, pT_range, output_path, ' deploy')
diff --git a/distributions/pt_rapidity.py b/distributions/pt_rapidity.py
new file mode 100644
index 0000000..43f6544
--- /dev/null
+++ b/distributions/pt_rapidity.py
@@ -0,0 +1,106 @@
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+
+
+from matplotlib.font_manager import FontProperties
+
+from matplotlib.ticker import (MultipleLocator, FormatStrFormatter,
+                               AutoMinorLocator)
+
+import gc
+import matplotlib as mpl
+
+
+
+mpl.rc('figure', max_open_warning = 0)
+
+
+
+def pT_vs_rapidity(df_orig, df_cut, difference, sign, x_range, y_range, output_path, data_name):
+    fig, axs = plt.subplots(1,3, figsize=(15, 4), gridspec_kw={'width_ratios': [1, 1, 1]})
+
+
+    if sign == 0:
+        s_label = 'Background'
+        m = 5
+
+    if sign == 1:
+        s_label = 'Signal'
+        m = 1
+
+    axs[0].set_aspect(aspect = 'auto')
+    axs[1].set_aspect(aspect = 'auto')
+    axs[2].set_aspect(aspect = 'auto')
+
+    rej = round((1 - (df_cut.shape[0] / df_orig.shape[0])) * 100, 5)
+    diff = df_orig.shape[0] - df_cut.shape[0]
+    axs[0].legend(shadow=True, title =str(len(df_orig))+' samples', fontsize =14)
+    axs[1].legend(shadow=True, title =str(len(df_cut))+' samples', fontsize =14)
+    axs[2].legend(shadow=True, title ='ML cut rejects \n'+ str(rej) +'% of '+ s_label +
+                  '\n ' + str(diff)+ ' samples were rejected ',
+                  fontsize =14)
+
+    counts0, xedges0, yedges0, im0 = axs[0].hist2d(df_orig['rapidity'], df_orig['pT'] , range = [x_range, y_range], bins=100,
+                                                   norm=mpl.colors.LogNorm(), cmap=plt.cm.rainbow)
+
+    axs[0].set_title(s_label + ' candidates before ML cut '+data_name, fontsize = 16)
+    axs[0].set_xlabel('rapidity', fontsize=15)
+    axs[0].set_ylabel('pT, GeV', fontsize=15)
+
+
+    mpl.pyplot.colorbar(im0, ax = axs[0])
+
+
+
+    axs[0].xaxis.set_major_locator(MultipleLocator(1))
+    axs[0].xaxis.set_major_formatter(FormatStrFormatter('%d'))
+
+    axs[0].xaxis.set_tick_params(which='both', width=2)
+
+
+    fig.tight_layout()
+
+
+    counts1, xedges1, yedges1, im1 = axs[1].hist2d(df_cut['rapidity'], df_cut['pT'] , range = [x_range, y_range], bins=100,
+                                                   norm=mpl.colors.LogNorm(), cmap=plt.cm.rainbow)
+
+    axs[1].set_title(s_label + ' candidates after ML cut '+data_name, fontsize = 16)
+    axs[1].set_xlabel('rapidity', fontsize=15)
+    axs[1].set_ylabel('pT, GeV', fontsize=15)
+
+    mpl.pyplot.colorbar(im1, ax = axs[1])
+
+
+
+
+
+    axs[1].xaxis.set_major_locator(MultipleLocator(1))
+    axs[1].xaxis.set_major_formatter(FormatStrFormatter('%d'))
+
+    axs[1].xaxis.set_tick_params(which='both', width=2)
+
+    fig.tight_layout()
+
+
+    counts2, xedges2, yedges2, im2 = axs[2].hist2d(difference['rapidity'], difference['pT'] , range = [x_range, y_range], bins=100,
+                                                   norm=mpl.colors.LogNorm(), cmap=plt.cm.rainbow)
+
+    axs[2].set_title(s_label + ' difference ', fontsize = 16)
+    axs[2].set_xlabel('rapidity', fontsize=15)
+    axs[2].set_ylabel('pT, GeV', fontsize=15)
+
+    mpl.pyplot.colorbar(im2, ax = axs[2])
+
+
+
+
+
+    axs[2].xaxis.set_major_locator(MultipleLocator(1))
+    axs[2].xaxis.set_major_formatter(FormatStrFormatter('%d'))
+
+    axs[2].xaxis.set_tick_params(which='both', width=2)
+
+    fig.tight_layout()
+
+    fig.savefig(output_path+'/pT_rapidity_'+s_label+'_ML_cut_'+data_name+'.png')
diff --git a/distributions/twoD_mass_pT.py b/distributions/twoD_mass_pT.py
new file mode 100644
index 0000000..93dc79b
--- /dev/null
+++ b/distributions/twoD_mass_pT.py
@@ -0,0 +1,51 @@
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sn
+from matplotlib.backends.backend_pdf import PdfPages
+import matplotlib as mpl
+
+def plot2D(df, sample, sgn, x_axis_value, y_axis_value, range_x, range_y, pdf_key):
+    """
+    Plots a 2-D histogram of y_axis_value against x_axis_value into a pdf.
+
+    x_axis_value: e.g. 'mass'
+    y_axis_value: e.g. 'distance'
+    range_x: e.g. [1, 1.177]
+    range_y: e.g. [0, 100]
+    pdf_key: an open PdfPages object the figure is appended to
+    """
+
+    fig, axs = plt.subplots(figsize=(15, 10))
+    cax = plt.hist2d(df[x_axis_value],df[y_axis_value],range=[range_x, range_y], bins=100,
+                     norm=mpl.colors.LogNorm(), cmap=plt.cm.viridis)
+
+    # default unit label for variables that are neither mass nor pT
+    unit = ''
+
+    if x_axis_value=='mass':
+        unit = r' $, \frac{GeV}{c^2}$ '
+        plt.vlines(x=1.115683,ymin=range_y[0],ymax=range_y[1], color='r', linestyle='-')
+
+    if x_axis_value=='pT':
+        unit = r' $, \frac{GeV}{c}$'
+
+
+    if sgn==1:
+        plt.title('Signal candidates ' + sample, fontsize = 25)
+
+    if sgn==0:
+        plt.title('Background candidates' + sample, fontsize = 25)
+
+
+    plt.xlabel(x_axis_value+unit, fontsize=25)
+    plt.ylabel(y_axis_value, fontsize=25)
+
+
+
+    mpl.pyplot.colorbar()
+
+    plt.legend(shadow=True,title =str(len(df))+ " samples")
+
+    fig.tight_layout()
+    plt.savefig(pdf_key,format='pdf')
diff --git a/distributions/var1Dcorr.py b/distributions/var1Dcorr.py
new file mode 100644
index 0000000..1724b1a
--- /dev/null
+++ b/distributions/var1Dcorr.py
@@ -0,0 +1,43 @@
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+
+def vars(df, var_to_corr):
+    vars_to_draw = list(df)
+    vars_to_draw.remove(var_to_corr)
+    return vars_to_draw
+
+
+def calculate_correlation(df, vars_to_corr, target_var) :
+
+    from scipy.stats import sem
+
+    mean = df[target_var].mean()
+    sigma = df[target_var].std()
+
+    correlation = []
+    error = []
+
+    for j in vars_to_corr :
+        mean_j = df[j].mean()
+        sigma_j = df[j].std()
+
+        cov = (df[j] - mean_j) * (df[target_var] - mean) / (sigma*sigma_j)
+        correlation.append(cov.mean())
+        error.append(sem(cov))
+
+    return correlation, error
+
+
+def plot1Dcorrelation(vars_to_draw, var_to_corr, corr_signal, corr_signal_errors, corr_bg, corr_bg_errors, output_path):
+    fig, ax = plt.subplots(figsize=(20,10))
+    plt.errorbar(vars_to_draw, corr_signal, yerr=corr_signal_errors, fmt='')
+    plt.errorbar(vars_to_draw, corr_bg, yerr=corr_bg_errors, fmt='')
+    ax.grid(zorder=0)
+    ax.set_xticklabels(vars_to_draw, fontsize=25, rotation =70)
+    ax.tick_params(axis='y', labelsize=25)
+    plt.legend(('signal','background'), fontsize = 25)
+    plt.title('Correlation of all variables with '+ var_to_corr+' along with SEM', fontsize = 25)
+    plt.ylabel('Correlation coefficient', fontsize = 25)
+    fig.tight_layout()
+    fig.savefig(output_path+'/all_vars_corr-'+ var_to_corr+'.png')
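+# NOTE (illustration only): calculate_correlation above estimates the Pearson
+# coefficient as the mean of the elementwise standardized products,
+# r ~ mean( ((x - mean_x)/std_x) * ((y - mean_y)/std_y) ),
+# with scipy's sem() of those products as the uncertainty. A quick cross-check
+# on a hypothetical dataframe df, which should agree closely for large samples:
+#
+#     r_pandas = df['x'].corr(df['y'])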
diff --git a/distributions/var_distr.py b/distributions/var_distr.py
new file mode 100644
index 0000000..931e995
--- /dev/null
+++ b/distributions/var_distr.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+
+import gc
+from matplotlib.backends.backend_pdf import PdfPages
+from matplotlib.font_manager import FontProperties
+
+import matplotlib as mpl
+
+
+mpl.rc('figure', max_open_warning = 0)
+
+def hist_variables(sample, dfs_orig, dfb_orig, dfs_cut, dfb_cut, difference_s, feature, pdf_key):
+    """
+    Plots the distribution of the given feature before the ML cut, after it, and
+    for the rejected signal, into a pdf file.
+
+    Parameters
+    ----------
+    sample: str
+        name of the data set (e.g. ' train'), used in the plot titles
+    dfs_orig: dataframe
+        signal before the ML cut
+    dfb_orig: dataframe
+        background before the ML cut
+    dfs_cut: dataframe
+        signal after the ML cut
+    dfb_cut: dataframe
+        background after the ML cut
+    difference_s: dataframe
+        signal rejected by the ML cut
+    feature: str
+        name of the feature to be plotted
+    pdf_key: PdfPages object
+        name of pdf document with distributions
+    """
+
+
+    fig, ax = plt.subplots(3, figsize=(20, 10))
+
+
+    fontP = FontProperties()
+    fontP.set_size('xx-large')
+
+    ax[0].hist(dfs_orig[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'blue')
+    ax[0].hist(dfb_orig[feature], label = 'background', bins = 500, alpha = 0.4, color = 'red')
+    ax[0].legend(shadow=True,title = 'S/B='+ str(round(len(dfs_orig)/len(dfb_orig), 3)) +
+                 '\n S samples: '+str(dfs_orig.shape[0]) + '\n B samples: '+ str(dfb_orig.shape[0]) +
+                 '\nquality cuts ',
+                 title_fontsize=15, fontsize =15, bbox_to_anchor=(1.05, 1),
+                 loc='upper left', prop=fontP,)
+
+    ax[0].set_xlim(dfb_orig[feature].min(), dfb_orig[feature].max())
+
+    ax[0].xaxis.set_tick_params(labelsize=15)
+    ax[0].yaxis.set_tick_params(labelsize=15)
+
+    ax[0].set_title(str(feature) + ' MC '+ sample, fontsize = 25)
+    ax[0].set_xlabel(feature, fontsize = 25)
+
+    if feature!='mass':
+        ax[0].set_yscale('log')
+
+    fig.tight_layout()
+
+
+    ax[1].hist(dfs_cut[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'blue')
+    ax[1].hist(dfb_cut[feature], label = 'background', bins = 500, alpha = 0.4, color = 'red')
+    ax[1].legend(shadow=True,title = 'S/B='+ str(round(len(dfs_cut)/len(dfb_cut), 3)) +
+                 '\n S samples: '+str(dfs_cut.shape[0]) + '\n B samples: '+ str(dfb_cut.shape[0]) +
+                 '\nquality cuts + ML cut',
+                 title_fontsize=15, fontsize =15, bbox_to_anchor=(1.05, 1),
+                 loc='upper left', prop=fontP,)
+
+
+    ax[1].set_xlim(dfb_orig[feature].min(), dfb_orig[feature].max())
+
+    ax[1].xaxis.set_tick_params(labelsize=15)
+    ax[1].yaxis.set_tick_params(labelsize=15)
+
+    ax[1].set_title(feature + ' MC '+ sample, fontsize = 25)
+    ax[1].set_xlabel(feature, fontsize = 25)
+
+    if feature!='mass':
+        ax[1].set_yscale('log')
+
+    fig.tight_layout()
+
+
+
+
+    ax[2].hist(difference_s[feature], label = 'signal', bins = 500, alpha = 0.4, color = 'blue')
+    ax[2].legend(shadow=True,title ='S samples: '+str(len(difference_s)) +'\nsignal difference',
+                 title_fontsize=15, fontsize =15, bbox_to_anchor=(1.05, 1),
+                 loc='upper left', prop=fontP,)
+
+
+    ax[2].set_xlim(dfb_orig[feature].min(), dfb_orig[feature].max())
+
+    ax[2].xaxis.set_tick_params(labelsize=15)
+    ax[2].yaxis.set_tick_params(labelsize=15)
+
+    ax[2].set_title(feature + ' MC '+ sample, fontsize = 25)
+    ax[2].set_xlabel(feature, fontsize = 25)
+
+    if feature!='mass':
+        ax[2].set_yscale('log')
+
+    fig.tight_layout()
+
+
+
+    plt.savefig(pdf_key,format='pdf')
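+# NOTE (illustration only, hypothetical names): hist_variables appends one page
+# per call to an open PdfPages handle; the caller owns opening and closing it:
+#
+#     pdf = PdfPages(output_path + '/dists.pdf')
+#     for feat in ['mass', 'pT']:
+#         hist_variables(' train', dfs, dfb, dfs_cut, dfb_cut, diff_s, feat, pdf)
+#     pdf.close()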
diff --git a/library/CBM_ML/plot_tools.py b/library/CBM_ML/plot_tools.py
index 5996144..db02148 100644
--- a/library/CBM_ML/plot_tools.py
+++ b/library/CBM_ML/plot_tools.py
@@ -13,7 +13,7 @@
To find the best threshold which results in a higher signal-to-background ratio for
lambda candidates we use the parameter S0, called the approximate median significance
in the Higgs boson ML challenge (http://higgsml.lal.in2p3.fr/documentation,9.)
"""
-def AMS(y_true, y_predict, y_true1, y_predict1):
+def AMS(y_true, y_predict, y_true1, y_predict1, output_path):
    roc_auc=roc_auc_score(y_true, y_predict)
    fpr, tpr, thresholds = roc_curve(y_true, y_predict,drop_intermediate=False ,pos_label=1)
    S0 = sqrt(2 * ((tpr + fpr) * log((1 + tpr/fpr)) - tpr))
@@ -43,7 +43,7 @@ def AMS(y_true, y_predict, y_true1, y_predict1):
    plt.ylim([0, 1.02])
    #axs.axis([-0.01, 1, 0.9, 1])
    fig.tight_layout()
-    fig.savefig('hists.png')
+    fig.savefig(str(output_path)+'/hists.png')
    plt.show()
    return S0_best_threshold, S0_best_threshold1
@@ -51,11 +51,11 @@ def AMS(y_true, y_predict, y_true1, y_predict1):
"""
To visualize the true MC signal in the probability distribution returned by the XGB classifier
for a train-test data-set, the preds_prob function can be used.
-Its input are a data-frame, predictions of the classifier (probabilities) and the target in the data-frame, and shows how the True signal is present
+Its inputs are a data-frame, the predictions of the classifier (probabilities) and the target in the data-frame; it shows how the true signal is present
inside this probability.
"""
-def preds_prob(df, preds, true, dataset):
+def preds_prob(df, preds, true, dataset, output_path):
    if dataset =='train':
        label1 = 'XGB Predictions on the training data set'
    else:
@@ -70,15 +70,15 @@ def preds_prob(df, preds, true, dataset):
    err = np.sqrt(hist)
    center = (bins[:-1] + bins[1:]) / 2
-
+
    hist1, bins1 = np.histogram(TN[preds], bins=bins1)
    err1 = np.sqrt(hist1)
    plt.errorbar(center, hist1, yerr=err1, fmt='o',
                 c='Red', label='Background in predictions')
-
+
    plt.errorbar(center, hist, yerr=err, fmt='o', c='blue', label='Signal in predictions')
-
+
    ax.set_yscale('log')
    plt.xlabel('Probability',fontsize=18)
    plt.ylabel('Counts', fontsize=18)
@@ -88,18 +88,18 @@ def preds_prob(df, preds, true, dataset):
    ax.tick_params(axis='both', which='minor', labelsize=16)
    plt.show()
    fig.tight_layout()
-    fig.savefig('test_best.png')
+    fig.savefig(str(output_path)+'/test_best_pred.png')
+
-
"""
A **Confusion Matrix** $C$ is such that $C_{ij}$ is equal to the number of observations
known to be in group $i$ and predicted to be in group $j$. Thus in binary classification,
the count of true positives is $C_{00}$, false negatives $C_{01}$, false positives $C_{10}$,
and true negatives $C_{11}$.
If $ y^{'}_{i} $ is the predicted value of the $i$-th sample and $y_{i}$ is the corresponding
true value, then the fraction of correct predictions over
-$ n_{samples}$ is defined as
+$ n_{samples}$ is defined as
$$
True \: positives (y,y^{'}) = \sum_{i=1}^{n_{samples} } 1 (y^{'}_{i} = y_{i}=1)
-$$
+$$
The following function prints and plots the confusion matrix.
Normalization can be applied by setting `normalize=True`.
"""
@@ -159,7 +159,7 @@ def comaprison_XGB_KFPF(XGB_mass,KFPF_mass):
    hist1, bin_edges1 = np.histogram(XGB_mass,range=(1.09, 1.17), bins=300)
    hist2, bin_edges2 = np.histogram(KFPF_mass,range=(1.09, 1.17), bins=300)
-    #makes sense to have only positive values
+    #makes sense to have only positive values
    diff = (hist1 - hist2)
    axs[1].bar(bins[:-1], # this is what makes it comparable
               ns / ns1, # maybe check for div-by-zero!
@@ -174,9 +174,9 @@ def comaprison_XGB_KFPF(XGB_mass,KFPF_mass):
"""
-Function that plots signal and background in the train-test data set
+Function that plots signal and background in the train-test data set
"""
-def plt_sig_back(df):
+def plt_sig_back(df, output_path):
    range1 = (1.077, 1.18)
    fig, axs = plt.subplots(figsize=(10, 6))
    #df_scaled['mass'].plot.hist(bins = 300, range=range1,grid=True,sharey=True)
@@ -193,39 +193,39 @@ def plt_sig_back(df):
    axs.tick_params(axis='both', which='major', labelsize=18)
    plt.yscale("log")
    fig.tight_layout()
-    fig.savefig("hists.png")
-
+    fig.savefig(str(output_path)+"/hists.png")
+
+
-
# The following function will display the invariant mass histogram of the original 10k event set along with the mass histogram after we apply a cut
# on the probability prediction of xgb
-def cut_visualization(df, variable,cut, range1=(1.09, 1.19), bins1= 300 ):
+def cut_visualization(df, variable,cut, output_path, range1=(1.09, 1.19), bins1= 300):
    mask1 = df[variable]>cut
    df3=df[mask1]
-
+
    fig, ax2 = plt.subplots(figsize=(12, 8), dpi = 300)
    color = 'tab:blue'
    ax2.hist(df['mass'],bins = bins1, range=range1, facecolor='blue' ,alpha = 0.35, label='before selection')
-    ax2.set_ylabel('Counts', fontsize = 15, color=color)
+    ax2.set_ylabel('Counts', fontsize = 8, color=color)
    ax2.tick_params(axis='y', labelcolor=color)
-    ax2.legend( fontsize = 15, loc='upper left')
-    ax2.tick_params(axis='both', which='major', labelsize=15)
+    ax2.legend( fontsize = 8, loc='upper left')
+    ax2.tick_params(axis='both', which='major', labelsize=8)
    ax2.grid()
-    ax2.set_xlabel("Mass (GeV/${c^2}$)", fontsize = 18)
-
-
-
+    ax2.set_xlabel("Mass (GeV/${c^2}$)", fontsize = 8)
+
+
+
    color = 'tab:red'
    ax1 = ax2.twinx()
    ax1.hist(df3['mass'], bins = bins1, range=range1, facecolor='red',alpha = 0.35, label="XGB (with a cut > %.2f"%cut+')')
-    ax1.set_xlabel('Mass in GeV', fontsize = 15)
-    ax1.set_ylabel('Counts ', fontsize = 15, color=color)
+    ax1.set_xlabel('Mass in GeV', fontsize = 8)
+    ax1.set_ylabel('Counts ', fontsize = 8, color=color)
    ax1.tick_params(axis='y', labelcolor=color)
-    ax1.tick_params(axis='both', which='major', labelsize=15)
-    ax1.legend( fontsize = 18,loc='upper right' )
+    ax1.tick_params(axis='both', which='major', labelsize=8)
+    ax1.legend( fontsize = 8,loc='upper right' )
-    plt.title("The original sample's Invariant Mass along with mass after selection of XGB", fontsize = 15)
+    plt.title("The original sample's Invariant Mass along with the mass after XGB selection", fontsize = 10)
    #plt.text(0.02, 0.1, r'cut > %.4f'%cut, fontsize=15)
    plt.show()
    fig.tight_layout()
-    fig.savefig("test_best.png")
+    fig.savefig(str(output_path)+"/test_best_cut_inv_mass.png")
diff --git a/library/CBM_ML/tree_importer.py b/library/CBM_ML/tree_importer.py
index a159ee3..7a57414 100644
--- a/library/CBM_ML/tree_importer.py
+++ b/library/CBM_ML/tree_importer.py
@@ -7,13 +7,13 @@
"""
The tree_importer function takes in a flat analysis tree and returns a pandas data-frame
object. It has 3 inputs: the first one is the path of the analysis tree, the second one is the tree name and the third one is the number of CPU cores. The first and second input
-should be inserted as strings i.e. path inside a single quotation '' or double quotations "". The third input should be a number.
+should be inserted as strings, i.e. the path inside single '' or double "" quotation marks. The third input should be a number.
For example tree_importer("/home/flat_trees/a.tree","PlainTree",4)
"""
-def tree_importer(path,treename, n):
+def tree_importer(path,treename, n):
    #The number of parallel processors
    executor = ThreadPoolExecutor(n)
-
+
    #To open the root file and convert it to a pandas dataframe
    file = uproot.open(path+':'+treename, library='pd', decompression_executor=executor,
                       interpretation_executor=executor).arrays(library='np',decompression_executor=executor,
@@ -22,72 +22,38 @@
    return df
-"""
-The quality_cuts_plus_other_cuts function applies quality selection criteria with other selection criteria to reduce data size
-"""
-def quality_cuts_plus_other_cuts_lambda():
-    #The following quality selection criteria is applied
-    mass_cut = '('+labels[10]+' > 1.07) &'
-
-    coordinate_cut = '('+labels[20]+'>-50) & ('+labels[20]+'<50) & ('+labels[21]+'>-50) & ('+labels[21]+'<50) & ('+labels[22]+'>-1) & ('+labels[22]+'<80) &'
-
-    chi_2_positive_cut ='('+labels[0]+'>0) & ('+labels[8]+'>0) & ('+labels[2]+'>0) & ('+labels[1]+' > 0) &'
-
-    distance_cut = '('+labels[4]+'>0) & ('+labels[5]+'<80) & ('+labels[3]+'>0) & ('+labels[3]+'<100) &'
-
-    pz_cut = '('+labels[14]+'>0) & '
-    #Other cuts
-    pseudo_rapidity_cut_based_on_acceptance = '('+labels[19]+'>1) & ('+labels[19]+'<6.5) &'
-
-    angular_cut = '('+labels[6]+'>0.1) & ('+labels[7]+'>0.1) &'
-
-    data_reducing_cut = '('+labels[10]+'< 1.3) & ('+labels[15]+'<20) & ('+labels[0]+' < 1000) & ('+labels[2]+'<1e6) & ('+labels[1]+ '< 3e7) & ('+labels[4]+'<5000) & ('+labels[8]+'< 100000)'
-
-    cuts= mass_cut+coordinate_cut+chi_2_positive_cut+distance_cut+pz_cut+pseudo_rapidity_cut_based_on_acceptance+angular_cut+data_reducing_cut
-    return cuts
+def new_labels(df):
+    new_labels= ['chi2geo', 'chi2primneg', 'chi2primpos', 'chi2topo', 'distance', 'ldl',
+                 'mass','pT', 'issignal']
-"""
-Selects the labels only required for lambda analysis
-"""
-
-def labels_lambda(file):
-    find_labels = ['chi2_geo','prim_first','prim_second', 'distance','_dl','_l','cosine_first','cosine_second','chi2_topo','cosine_topo','_mass','_pT','_px','_py','_pz','_p','_phi','_rapidity','_pid','_eta','_x','_y','_z','_generation']
-    labels = []
-    for a in find_labels:
-        for s in file.keys():
-            if a in s:
-                if 'err' not in s:
-                    if s not in labels:
-                        labels.append(s)
-    return labels
+    df.columns = new_labels
+    return df
-"""
-This tree_importer_with_cuts imports tree and also applies quality selection criteria on the data along with some further data reducing cuts.
-"""
+def quality_cuts(df):
-def tree_importer_with_cuts(path,treename, n):
-
-    #This part changes the labels of the root tree's branches
-
-    new_labels=['chi2geo', 'chi2primneg','chi2primpos', 'distance', 'ldl','mass', 'pT', 'rapidity','issignal']
-
-    #The number of parallel processors
-    executor = ThreadPoolExecutor(n)
-
-    #To open the
-    file = uproot.open(path+':'+treename, library='pd', decompression_executor=executor,
-                       interpretation_executor=executor)
-    labels = labels_lambda(file)
-    cuts = quality_cuts_plus_other_cuts_lambda(labels)
-    select_labels = [labels[0],labels[1],labels[2],labels[3],labels[4],labels[10],labels[11],labels[18],labels[23]]
+    """
+    Removes numerical artifacts ($\chi^2$ < 0), inf and NaN entries, and applies
+    quality cuts based on the detector geometry.
+    A full description can be found at
+    https://docs.google.com/document/d/11f0ZKPW8ftTVhTxeWiog1g6qdsGgN1mlIE3vd5FHLbc/edit?usp=sharing
+
+    Parameters
+    ------------------
+    df: dataframe
+        dataframe to be cleaned
+    """
-    np_arrays = file.arrays(select_labels, cuts, library='np',decompression_executor=executor,
-                            interpretation_executor=executor)
-    df= pd.DataFrame(data=np_arrays)
-    df.columns = new_labels
-    #df['issignal']=((df['issignal']>0)*1)
    with pd.option_context('mode.use_inf_as_na', True):
        df = df.dropna()
-    return df
+    # with use_inf_as_na enabled above, inf rows are dropped together with NaNs;
+    # this second dropna is a safety net
+    df = df.dropna()
+
+    chi2_cut = (df['chi2geo'] > 0) & (df['chi2primpos'] > 0) & (df['chi2primneg'] > 0) &\
+               (df['chi2topo'] > 0)
+    mass_cut = (df['mass'] > 1.077)
+
+    # coord_cut = (abs(df['x']) < 50) & (abs(df['y']) < 50)
+    dist_l_cut = (df['distance'] > 0) & (df['distance'] < 100) & (df['ldl'] > 0 )
+
+
+    cuts = (chi2_cut) & (mass_cut) & (dist_l_cut)
+    return df[cuts]
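+
+# NOTE (illustration only, hypothetical path): the intended flow of the helpers above:
+#
+#     df = tree_importer('/path/to/PFSimplePlainTree.root', 'PlainTree', 4)
+#     df = new_labels(df)    # assumes the tree has exactly these 9 branches, in this order
+#     df = quality_cuts(df)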