From a544e706eb7fd8fc9b2fdc3ca9a71575605d0b86 Mon Sep 17 00:00:00 2001 From: aviadsusman Date: Mon, 25 Mar 2024 15:08:02 -0400 Subject: [PATCH 01/16] initial commit --- .gitignore | 3 ++- eipy/additional_ensembles.py | 44 ++++++++++++++++++++++++++++++++++++ eipy/ei.py | 4 +--- eipy/metrics.py | 21 +++++++++++++++++ 4 files changed, 68 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 3e474ee..571d657 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ __pycache__ .venv -docs/build \ No newline at end of file +docs/build +poetry.lock \ No newline at end of file diff --git a/eipy/additional_ensembles.py b/eipy/additional_ensembles.py index c62a4a9..198909e 100644 --- a/eipy/additional_ensembles.py +++ b/eipy/additional_ensembles.py @@ -14,7 +14,18 @@ class MeanAggregation(BaseEstimator, ClassifierMixin): """ + Mean Aggregation + Trivially takes the mean of X. + + Attributes + ---------- + classes : array + Ordered arrray of unique labels for computing mean. + X_ : array of (n_samples, n_features) + Base predictor data for computing mean. + y_ : array of (n_samples,) + True labels of X_. """ def __init__(self): @@ -36,7 +47,18 @@ def predict_proba(self, X): class MedianAggregation(BaseEstimator, ClassifierMixin): """ + Median Aggregation + Trivially takes the median of X. + + Attributes + ---------- + classes : array + Ordered arrray of unique labels for computing mean. + X_ : array of (n_samples, n_features) + Base predictor data for computing mean. + y_ : array of (n_samples,) + True labels of X_. """ def __init__(self): @@ -63,6 +85,28 @@ class CES(BaseEstimator, ClassifierMixin): Caruana R. et al. (2006) Getting the most out of ensemble selection. In: Sixth International Conference on Data Mining (ICDM'06), 2006 IEEE, Piscataway, NJ, USA, pp. 828-833. + + Sort models by score with respect to chosen metric. Select best performer + + Parameters + ---------- + scoring : + + max_ensemble_size : int + Maximum number of base models to ensemble. + random_state : int + For determining a rarndom state + greater_is_better : bool + + + Attributes + ---------- + classes : array + Ordered arrray of unique labels for computing mean. + X_ : array of (n_samples, n_features) + Base predictor data for computing mean. + y_ : array of (n_samples,) + True labels of X_. """ def __init__( diff --git a/eipy/ei.py b/eipy/ei.py index ed03f8d..0f22bee 100755 --- a/eipy/ei.py +++ b/eipy/ei.py @@ -313,7 +313,7 @@ def fit_ensemble(self, ensemble_predictors=None): def predict(self, X_dict, ensemble_model_key): """ - Predict class labels for samples in X + Predict class labels for samples in X. Parameters ---------- @@ -676,7 +676,6 @@ def save(self, path=None): Parameters ---------- - path : optional, default=None Path to save the EnsembleIntegration class object. """ @@ -694,7 +693,6 @@ def load(cls, path): Parameters ---------- - path : str Path to load the EnsembleIntegration class object. """ diff --git a/eipy/metrics.py b/eipy/metrics.py index 3233277..14ffbf3 100644 --- a/eipy/metrics.py +++ b/eipy/metrics.py @@ -6,6 +6,27 @@ def fmax_score(y_test, y_score, beta=1.0, pos_label=1): + """ + Computes the maximum F-score (the harmonic mean of precision and recall) and the corresponding threshold. + + Parameters + ---------- + y_test : array of shape (n_samples,) + Array of test labels. + y_pred : array of shape (n_samples,) + Array of predicted probabilities on test data. + beta : float + Parameter for weighing precision and recall in F score calculations. 
+ pos_label : bool + Class selection for computing F scores. + + Returns + ------- + fmax_score : float64 + Calculated fmax + threshold_fmax : float64 + Threshold corresponding to returned fmax + """ fmax_score, _, _, threshold_fmax = fmax_precision_recall_threshold( y_test, y_score, beta=beta, pos_label=pos_label ) From 37143893905bec8c8e0c09bf22c9bb76118c1a65 Mon Sep 17 00:00:00 2001 From: Aviad Susman Date: Mon, 25 Mar 2024 16:19:26 -0400 Subject: [PATCH 02/16] all docstrings added, nonpublic methods indicated --- eipy/additional_ensembles.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/eipy/additional_ensembles.py b/eipy/additional_ensembles.py index 198909e..b803264 100644 --- a/eipy/additional_ensembles.py +++ b/eipy/additional_ensembles.py @@ -86,8 +86,6 @@ class CES(BaseEstimator, ClassifierMixin): In: Sixth International Conference on Data Mining (ICDM'06), 2006 IEEE, Piscataway, NJ, USA, pp. 828-833. - Sort models by score with respect to chosen metric. Select best performer - Parameters ---------- scoring : @@ -95,18 +93,20 @@ class CES(BaseEstimator, ClassifierMixin): max_ensemble_size : int Maximum number of base models to ensemble. random_state : int - For determining a rarndom state + For determining a random state. greater_is_better : bool - + For sorting models by performance with respect to a metric. Attributes ---------- - classes : array - Ordered arrray of unique labels for computing mean. - X_ : array of (n_samples, n_features) - Base predictor data for computing mean. - y_ : array of (n_samples,) - True labels of X_. + selected_ensemble : list + List of models selected for ensemble. + train_performance : list + Record of model performances. + argbest : bool + True if metric of interest is to be maximized. Used for model selection. + best : bool + True if metric of interest is to be maximized. Used for selecting maximum scorers. """ def __init__( From 6407546ffae2bd9b70b3bab2b7c3b96a3f1664a7 Mon Sep 17 00:00:00 2001 From: Aviad Susman Date: Mon, 25 Mar 2024 16:20:13 -0400 Subject: [PATCH 03/16] ready for pr --- eipy/ei.py | 66 +++++++++++++++++++++--------------------- eipy/interpretation.py | 12 ++++---- eipy/metrics.py | 34 +++++++++++----------- eipy/utils.py | 28 +++++++++--------- 4 files changed, 70 insertions(+), 70 deletions(-) diff --git a/eipy/ei.py b/eipy/ei.py index 0f22bee..d769f63 100755 --- a/eipy/ei.py +++ b/eipy/ei.py @@ -16,21 +16,21 @@ from joblib import Parallel, delayed import warnings from eipy.utils import ( - X_is_dict, - X_to_numpy, - y_to_numpy, - set_predictor_seeds, - random_integers, - sample, - retrieve_X_y, - append_modality, - safe_predict_proba, + _X_is_dict, + _X_to_numpy, + _y_to_numpy, + _set_predictor_seeds, + _random_integers, + _sample, + _retrieve_X_y, + _append_modality, + _safe_predict_proba, dummy_cv, bar_format, ) from eipy.metrics import ( - base_summary, - ensemble_summary, + _base_summary, + _ensemble_summary, ) warnings.filterwarnings("ignore", category=DeprecationWarning) @@ -180,7 +180,7 @@ def __init__( self.modality_names = [] self.n_features_per_modality = [] - self.random_numbers_for_samples = random_integers( + self.random_numbers_for_samples = _random_integers( n_integers=n_samples, seed=self.random_state ) self.feature_names = {} @@ -209,17 +209,17 @@ def fit_base(self, X, y, base_predictors=None, modality_name=None): \n... 
for ensemble performance analysis...""" ) # convert y to a numpy array - y = y_to_numpy(y) + y = _y_to_numpy(y) # check if base_predictors are passed here if base_predictors is not None: self.base_predictors = base_predictors # update base predictors # set random_states in base_predictors - set_predictor_seeds(self.base_predictors, self.random_state) + _set_predictor_seeds(self.base_predictors, self.random_state) # check data format and train accordingly - if X_is_dict(X): + if _X_is_dict(X): for modality_name, modality in X.items(): self._fit_base( X=modality, @@ -251,12 +251,12 @@ def fit_ensemble(self, ensemble_predictors=None): if ensemble_predictors is not None: self.ensemble_predictors = ensemble_predictors - set_predictor_seeds(self.ensemble_predictors, self.random_state) + _set_predictor_seeds(self.ensemble_predictors, self.random_state) y_test_combined = [] for fold_id in range(self.k_outer): - _, y_test = retrieve_X_y(labelled_data=self.ensemble_test_data[fold_id]) + _, y_test = _retrieve_X_y(labelled_data=self.ensemble_test_data[fold_id]) y_test_combined.extend(y_test) ensemble_predictions = {} @@ -269,17 +269,17 @@ def fit_ensemble(self, ensemble_predictors=None): y_pred_combined = [] for fold_id in range(self.k_outer): - X_train, y_train = retrieve_X_y( + X_train, y_train = _retrieve_X_y( labelled_data=self.ensemble_training_data[fold_id] ) - X_test, _ = retrieve_X_y(labelled_data=self.ensemble_test_data[fold_id]) + X_test, _ = _retrieve_X_y(labelled_data=self.ensemble_test_data[fold_id]) if self.sampling_aggregation == "mean": X_train = X_train.T.groupby(level=[0, 1]).mean().T X_test = X_test.T.groupby(level=[0, 1]).mean().T model.fit(X_train, y_train) - y_pred = safe_predict_proba(model, X_test) + y_pred = _safe_predict_proba(model, X_test) y_pred_combined.extend(y_pred) ensemble_predictions[model_name] = y_pred_combined @@ -287,7 +287,7 @@ def fit_ensemble(self, ensemble_predictors=None): ensemble_predictions["labels"] = y_test_combined self.ensemble_predictions = pd.DataFrame.from_dict(ensemble_predictions) - self.ensemble_summary = ensemble_summary( + self.ensemble_summary = _ensemble_summary( self.ensemble_predictions, self.metrics ) @@ -297,7 +297,7 @@ def fit_ensemble(self, ensemble_predictors=None): desc="Training final ensemble models", bar_format=bar_format, ): - X_train, y_train = retrieve_X_y( + X_train, y_train = _retrieve_X_y( labelled_data=self.ensemble_training_data_final[0] ) @@ -335,7 +335,7 @@ def predict(self, X_dict, ensemble_model_key): modality_name = self.modality_names[i] X = X_dict[modality_name] - X, _ = X_to_numpy(X) + X, _ = _X_to_numpy(X) base_models = copy.deepcopy(self.final_models["base models"][modality_name]) self.base_predictors = {} @@ -344,7 +344,7 @@ def predict(self, X_dict, ensemble_model_key): self.base_predictors[base_model_dict['model name']] = 0 base_model = pickle.loads(base_model_dict["pickled model"]) - y_pred = safe_predict_proba(base_model, X) + y_pred = _safe_predict_proba(base_model, X) base_model_dict["fold id"] = 0 base_model_dict["y_pred"] = y_pred @@ -352,7 +352,7 @@ def predict(self, X_dict, ensemble_model_key): combined_predictions = self._combine_predictions_outer( base_models, modality_name, model_building=True ) - ensemble_prediction_data = append_modality( + ensemble_prediction_data = _append_modality( ensemble_prediction_data, combined_predictions, model_building=True ) ensemble_prediction_data = ensemble_prediction_data[0] @@ -366,12 +366,12 @@ def predict(self, X_dict, ensemble_model_key): 
self.final_models["ensemble models"][ensemble_model_key] ) - y_pred = safe_predict_proba(ensemble_model, ensemble_prediction_data) + y_pred = _safe_predict_proba(ensemble_model, ensemble_prediction_data) return y_pred @ignore_warnings(category=ConvergenceWarning) def _fit_base(self, X, y, base_predictors=None, modality_name=None): - X, feature_names = X_to_numpy(X) + X, feature_names = _X_to_numpy(X) self.modality_names.append(modality_name) self.feature_names[modality_name] = feature_names @@ -386,7 +386,7 @@ def _fit_base(self, X, y, base_predictors=None, modality_name=None): modality_name=modality_name, ) - self.ensemble_training_data = append_modality( + self.ensemble_training_data = _append_modality( self.ensemble_training_data, ensemble_training_data_modality ) @@ -398,12 +398,12 @@ def _fit_base(self, X, y, base_predictors=None, modality_name=None): modality_name=modality_name, ) - self.ensemble_test_data = append_modality( + self.ensemble_test_data = _append_modality( self.ensemble_test_data, ensemble_test_data_modality ) # append data to dataframe # create a summary of base predictor performance - self.base_summary = base_summary(self.ensemble_test_data, self.metrics) + self.base_summary = _base_summary(self.ensemble_test_data, self.metrics) if self.model_building: self._fit_base_final(X=X, y=y, modality_name=modality_name) @@ -427,7 +427,7 @@ def _fit_base_final(self, X, y, modality_name=None): modality_name=modality_name, ) - self.ensemble_training_data_final = append_modality( + self.ensemble_training_data_final = _append_modality( self.ensemble_training_data_final, ensemble_training_data_modality ) @@ -561,7 +561,7 @@ def _train_predict_single_base_predictor( X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] - X_sample, y_sample = sample( + X_sample, y_sample = _sample( X_train, y_train, strategy=self.sampling_strategy, @@ -580,7 +580,7 @@ def _train_predict_single_base_predictor( } else: - y_pred = safe_predict_proba(model, X_test) + y_pred = _safe_predict_proba(model, X_test) results_dict = { "model name": model_name, diff --git a/eipy/interpretation.py b/eipy/interpretation.py index 8b6025a..cb23c99 100644 --- a/eipy/interpretation.py +++ b/eipy/interpretation.py @@ -1,5 +1,5 @@ from sklearn.inspection import permutation_importance -from eipy.utils import X_to_numpy, retrieve_X_y, bar_format, y_to_numpy +from eipy.utils import _X_to_numpy, _retrieve_X_y, bar_format, _y_to_numpy import pandas as pd from tqdm import tqdm import numpy as np @@ -102,7 +102,7 @@ def rank_product_score(self, X_dict, y): ensemble_predictor_keys = self.ensemble_predictor_keys if self.LFR is None: - self.local_feature_rank(X_dict, y_to_numpy(y)) + self.local_feature_rank(X_dict, _y_to_numpy(y)) if self.LMR is None: self.local_model_rank(ensemble_predictor_keys=ensemble_predictor_keys) @@ -151,7 +151,7 @@ def rank_product_score(self, X_dict, y): return self - def local_feature_rank(self, X_dict, y): + def _local_feature_rank(self, X_dict, y): """ Local Feature Ranks (LFRs) for each base predictor @@ -177,7 +177,7 @@ def local_feature_rank(self, X_dict, y): bar_format=bar_format, ): X = X_dict[modality_name] - X, feature_names = X_to_numpy(X) + X, feature_names = _X_to_numpy(X) # check feature names were seen during training if len(self.EI.feature_names[modality_name]) > 1: @@ -285,7 +285,7 @@ def local_feature_rank(self, X_dict, y): return self - def local_model_rank(self, ensemble_predictor_keys): + def _local_model_rank(self, ensemble_predictor_keys): 
""" Local Model Ranks (LMRs) @@ -302,7 +302,7 @@ def local_model_rank(self, ensemble_predictor_keys): """ # load ensemble training data from EI training - ensemble_X_train, ensemble_y_train = retrieve_X_y( + ensemble_X_train, ensemble_y_train = _retrieve_X_y( labelled_data=self.EI.ensemble_training_data_final[0] ) diff --git a/eipy/metrics.py b/eipy/metrics.py index 14ffbf3..469263d 100644 --- a/eipy/metrics.py +++ b/eipy/metrics.py @@ -1,7 +1,7 @@ import numpy as np import pandas as pd import inspect -from eipy.utils import minority_class +from eipy.utils import _minority_class from sklearn.metrics import roc_auc_score, precision_recall_curve @@ -27,13 +27,13 @@ def fmax_score(y_test, y_score, beta=1.0, pos_label=1): threshold_fmax : float64 Threshold corresponding to returned fmax """ - fmax_score, _, _, threshold_fmax = fmax_precision_recall_threshold( + fmax_score, _, _, threshold_fmax = _fmax_precision_recall_threshold( y_test, y_score, beta=beta, pos_label=pos_label ) return fmax_score, threshold_fmax -def fmax_precision_recall_threshold(labels, y_score, beta=1.0, pos_label=1): +def _fmax_precision_recall_threshold(labels, y_score, beta=1.0, pos_label=1): """ Radivojac, P. et al. (2013). A Large-Scale Evaluation of Computational Protein Function Prediction. Nature Methods, 10(3), 221-227. @@ -65,7 +65,7 @@ def fmax_precision_recall_threshold(labels, y_score, beta=1.0, pos_label=1): return fmax_score, precision_fmax, recall_fmax, threshold_fmax -def try_metric_with_pos_label(y_true, y_pred, metric, pos_label): +def _try_metric_with_pos_label(y_true, y_pred, metric, pos_label): """ Compute score for a given metric. """ @@ -76,7 +76,7 @@ def try_metric_with_pos_label(y_true, y_pred, metric, pos_label): return score -def scores(y_true, y_pred, metrics): +def _scores(y_true, y_pred, metrics): """ Compute all metrics for a single set of predictions. Returns a dictionary containing metric keys, each paired to a tuple (score, threshold). 
@@ -86,7 +86,7 @@ def scores(y_true, y_pred, metrics): if metrics is None: metrics = {"fmax (minority)": fmax_score, "auc": roc_auc_score} - pos_label = minority_class(y_true) # gives value 1 or 0 + pos_label = _minority_class(y_true) # gives value 1 or 0 metric_threshold_dict = {} @@ -96,14 +96,14 @@ def scores(y_true, y_pred, metrics): if "y_pred" in inspect.signature(metric).parameters: # calculate metric for target vector with threshold=0.5 metric_threshold_dict[metric_key] = ( - try_metric_with_pos_label( + _try_metric_with_pos_label( y_true, (np.array(y_pred) >= 0.5).astype(int), metric, pos_label ), 0.5, ) # if y_score parameter exists in metric function then y should be probability vector elif "y_score" in inspect.signature(metric).parameters: - metric_results = try_metric_with_pos_label( + metric_results = _try_metric_with_pos_label( y_true, y_pred, metric, pos_label ) if isinstance( @@ -116,7 +116,7 @@ def scores(y_true, y_pred, metrics): return metric_threshold_dict -def scores_matrix(X, labels, metrics): +def _scores_matrix(X, labels, metrics): """ Calculate metrics and threshold (if applicable) for each column (set of predictions) in matrix X @@ -125,7 +125,7 @@ def scores_matrix(X, labels, metrics): scores_dict = {} for column in X.columns: column_temp = X[column] - metrics_per_column = scores(labels, column_temp, metrics) + metrics_per_column = _scores(labels, column_temp, metrics) # metric_names = list(metrics.keys()) for metric_key in metrics_per_column.keys(): if not (metric_key in scores_dict): @@ -136,13 +136,13 @@ def scores_matrix(X, labels, metrics): return scores_dict -def create_metric_threshold_dataframes(X, labels, metrics): +def _create_metric_threshold_dataframes(X, labels, metrics): """ Create a separate dataframe for metrics and thresholds. thresholds_df contains NaN if threshold not applicable. 
""" - scores_dict = scores_matrix(X, labels, metrics) + scores_dict = _scores_matrix(X, labels, metrics) metrics_df = pd.DataFrame(columns=X.columns) thresholds_df = pd.DataFrame(columns=X.columns) @@ -151,15 +151,15 @@ def create_metric_threshold_dataframes(X, labels, metrics): return metrics_df, thresholds_df -def create_metric_threshold_dict(X, labels, metrics): +def _create_metric_threshold_dict(X, labels, metrics): df_dict = {} - df_dict["metrics"], df_dict["thresholds"] = create_metric_threshold_dataframes( + df_dict["metrics"], df_dict["thresholds"] = _create_metric_threshold_dataframes( X, labels, metrics ) return df_dict -def base_summary(ensemble_test_dataframes, metrics): +def _base_summary(ensemble_test_dataframes, metrics): """ Create a base predictor performance summary by concatenating data across test folds """ @@ -173,10 +173,10 @@ def base_summary(ensemble_test_dataframes, metrics): return create_metric_threshold_dict(ensemble_test_averaged_samples, labels, metrics) -def ensemble_summary(ensemble_predictions, metrics): +def _ensemble_summary(ensemble_predictions, metrics): X = ensemble_predictions.drop(["labels"], axis=1) labels = ensemble_predictions["labels"] - return create_metric_threshold_dict(X, labels, metrics) + return _create_metric_threshold_dict(X, labels, metrics) # These two functions are an attempt at maximizing/minimizing any metric diff --git a/eipy/utils.py b/eipy/utils.py index f81bbff..31ba545 100755 --- a/eipy/utils.py +++ b/eipy/utils.py @@ -15,7 +15,7 @@ bar_format = "{desc}: |{bar}|{percentage:3.0f}%" -def minority_class(y_true): +def _minority_class(y_true): if np.bincount(y_true)[0] < np.bincount(y_true)[1]: minority_class = 0 else: @@ -23,7 +23,7 @@ def minority_class(y_true): return minority_class -def set_predictor_seeds(base_predictors, random_state): +def _set_predictor_seeds(base_predictors, random_state): for _, v in base_predictors.items(): if type(v) == Pipeline: est_ = list(v.named_steps)[-1] @@ -33,25 +33,25 @@ def set_predictor_seeds(base_predictors, random_state): v.set_params(**{"random_state": random_state}) -def X_is_dict(X): +def _X_is_dict(X): if isinstance(X, dict): return True else: return False -def X_dict_to_numpy(X_dict): +def _X_dict_to_numpy(X_dict): """ Retrieve feature names and convert arrays to numpy. """ X_dict_numpy = {} feature_names = {} for key, X in X_dict.items(): - X_dict_numpy[key], feature_names[key] = X_to_numpy(X) + X_dict_numpy[key], feature_names[key] = _X_to_numpy(X) return X_dict_numpy, feature_names -def X_to_numpy(X): +def _X_to_numpy(X): """ Return X as a numpy array, with feature names if applicable. """ @@ -66,7 +66,7 @@ def X_to_numpy(X): ) -def y_to_numpy(y): +def _y_to_numpy(y): """ Check y is numpy array and convert if not. 
""" @@ -85,13 +85,13 @@ def y_to_numpy(y): or pandas Series.""" ) - if not is_binary_array(_y): + if not _is_binary_array(_y): raise ValueError("y must contain binary values.") return _y -def is_binary_array(arr): +def _is_binary_array(arr): if all(x == 0 or x == 1 or x == 0.0 or x == 1.0 for x in arr): return True else: @@ -110,7 +110,7 @@ def get_n_splits(self, X, y, groups=None): return self.n_splits -def safe_predict_proba(model, X): # uses predict_proba method where possible +def _safe_predict_proba(model, X): # uses predict_proba method where possible if hasattr(model, "predict_proba"): y_pred = model.predict_proba(X)[:, 1] else: @@ -118,12 +118,12 @@ def safe_predict_proba(model, X): # uses predict_proba method where possible return y_pred -def random_integers(n_integers=1, seed=42): +def _random_integers(n_integers=1, seed=42): random.seed(seed) return random.sample(range(0, 10000), n_integers) -def sample(X, y, strategy, random_state): +def _sample(X, y, strategy, random_state): if strategy is None: X_resampled, y_resampled = X, y elif strategy == "undersampling": # define sampler @@ -161,13 +161,13 @@ def sample(X, y, strategy, random_state): return X_resampled, y_resampled -def retrieve_X_y(labelled_data): +def _retrieve_X_y(labelled_data): X = labelled_data.drop(columns=["labels"], level=0) y = np.ravel(labelled_data["labels"]) return X, y -def append_modality(current_data, modality_data, model_building=False): +def _append_modality(current_data, modality_data, model_building=False): if current_data is None: combined_dataframe = modality_data else: From b577291d1cfd5c3d752b82610247d9b2bdb83c55 Mon Sep 17 00:00:00 2001 From: Jamie Bennett <55380591+03bennej@users.noreply.github.com> Date: Wed, 3 Apr 2024 13:57:39 -0400 Subject: [PATCH 04/16] Update README.rst --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 5d56f15..06c4924 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -|Tests|_ |Coverage|_ |ReadTheDocs|_ |PythonVersion|_ |Black|_ |License|_ +|Tests| |Coverage| |ReadTheDocs| |PythonVersion| |Black| |License| .. |Tests| image:: https://github.com/GauravPandeyLab/eipy/actions/workflows/tests.yml/badge.svg .. _Tests: https://github.com/GauravPandeyLab/eipy/actions/workflows/tests.yml From 1de5bdc363deea09c307b7f3691dbaa2e2418605 Mon Sep 17 00:00:00 2001 From: Jamie Bennett <55380591+03bennej@users.noreply.github.com> Date: Wed, 3 Apr 2024 14:02:09 -0400 Subject: [PATCH 05/16] Update development.rst --- docs/source/development.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/development.rst b/docs/source/development.rst index f44941f..2f58f2f 100644 --- a/docs/source/development.rst +++ b/docs/source/development.rst @@ -49,9 +49,9 @@ Note that new test file names must have the prefix `test_`. 9. **Submit pull request**. Updates must be made via a pull request. Internal users should note that pushing to the main branch has been disabled. -10. **Publishing new versions to PyPI** (internal only). We now use `poetry-dynamic-versioning ` +10. **Publishing new versions to PyPI** (internal only). We now use `poetry-dynamic-versioning `__ to iterate version numbers in pyproject.toml automatically. You can publish to PyPI by creating a new `release `__, which will run the "Publish to PyPI" workflow. This workflow determines the PyPI version number from the GitHub release tag, which you should manually iterate. 
-Note: to test things out first, you can try manually running the "Publish to test PyPI" workflow. \ No newline at end of file +Note: to test things out first, you can try manually running the "Publish to test PyPI" workflow. From 09d7ddc438617892a79e6c1de4d13e243e42d8a8 Mon Sep 17 00:00:00 2001 From: Jamie Bennett <55380591+03bennej@users.noreply.github.com> Date: Wed, 3 Apr 2024 14:07:05 -0400 Subject: [PATCH 06/16] Update README.rst --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 06c4924..2a828fd 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -|Tests| |Coverage| |ReadTheDocs| |PythonVersion| |Black| |License| +_|Tests|_ |Coverage| _ |ReadTheDocs| _ |PythonVersion| _ |Black| _ |License| _ .. |Tests| image:: https://github.com/GauravPandeyLab/eipy/actions/workflows/tests.yml/badge.svg .. _Tests: https://github.com/GauravPandeyLab/eipy/actions/workflows/tests.yml From abd1e411c9565c7095c6d918a219cb7193b95f02 Mon Sep 17 00:00:00 2001 From: Jamie Bennett <55380591+03bennej@users.noreply.github.com> Date: Wed, 3 Apr 2024 14:11:03 -0400 Subject: [PATCH 07/16] Update README.rst --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 2a828fd..5d56f15 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -_|Tests|_ |Coverage| _ |ReadTheDocs| _ |PythonVersion| _ |Black| _ |License| _ +|Tests|_ |Coverage|_ |ReadTheDocs|_ |PythonVersion|_ |Black|_ |License|_ .. |Tests| image:: https://github.com/GauravPandeyLab/eipy/actions/workflows/tests.yml/badge.svg .. _Tests: https://github.com/GauravPandeyLab/eipy/actions/workflows/tests.yml From b7cd4d123c28ff5e77c8df64bb8a5e981047ede8 Mon Sep 17 00:00:00 2001 From: Jamie Bennett <55380591+03bennej@users.noreply.github.com> Date: Wed, 3 Apr 2024 14:28:04 -0400 Subject: [PATCH 08/16] Update README.rst --- README.rst | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.rst b/README.rst index 5d56f15..a1ddd14 100644 --- a/README.rst +++ b/README.rst @@ -1,22 +1,22 @@ -|Tests|_ |Coverage|_ |ReadTheDocs|_ |PythonVersion|_ |Black|_ |License|_ +|Tests| |Coverage| |ReadTheDocs| |PythonVersion| |Black| |License| .. |Tests| image:: https://github.com/GauravPandeyLab/eipy/actions/workflows/tests.yml/badge.svg -.. _Tests: https://github.com/GauravPandeyLab/eipy/actions/workflows/tests.yml + :target: https://github.com/GauravPandeyLab/eipy/actions/workflows/tests.yml .. |Coverage| image:: https://codecov.io/gh/GauravPandeyLab/eipy/graph/badge.svg?token=M2AU2XWJB8 -.. _Coverage: https://codecov.io/gh/GauravPandeyLab/eipy + :target: https://codecov.io/gh/GauravPandeyLab/eipy .. |ReadTheDocs| image:: https://readthedocs.org/projects/eipy/badge/?version=latest -.. _ReadTheDocs: https://eipy.readthedocs.io/en/latest/ + :target: https://eipy.readthedocs.io/en/latest/ .. |PythonVersion| image:: https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11-blue -.. _PythonVersion: https://github.com/GauravPandeyLab/eipy + :target: https://pypi.org/project/ensemble-integration/ .. |Black| image:: https://img.shields.io/badge/code%20style-black-000000.svg -.. _Black: https://github.com/psf/black + :target: https://github.com/psf/black .. |License| image:: https://img.shields.io/badge/License-GPLv3-blue -.. 
_License: https://github.com/GauravPandeyLab/eipy/blob/main/COPYING + :target: https://github.com/GauravPandeyLab/eipy/blob/main/COPYING ``ensemble-integration``: Integrating multi-modal data for predictive modeling From 3fbf2f50bd46f8f596263afadcd85332d9d5e85b Mon Sep 17 00:00:00 2001 From: Jamie Bennett <55380591+03bennej@users.noreply.github.com> Date: Wed, 3 Apr 2024 14:57:12 -0400 Subject: [PATCH 09/16] Update README.rst --- README.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index a1ddd14..28a4ec1 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -|Tests| |Coverage| |ReadTheDocs| |PythonVersion| |Black| |License| +|Tests| |Coverage| |ReadTheDocs| |PythonVersion| |PyPI| |Black| |License| .. |Tests| image:: https://github.com/GauravPandeyLab/eipy/actions/workflows/tests.yml/badge.svg :target: https://github.com/GauravPandeyLab/eipy/actions/workflows/tests.yml @@ -9,9 +9,11 @@ .. |ReadTheDocs| image:: https://readthedocs.org/projects/eipy/badge/?version=latest :target: https://eipy.readthedocs.io/en/latest/ -.. |PythonVersion| image:: https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11-blue +.. |PyPI| image:: https://img.shields.io/pypi/v/ensemble-integration :target: https://pypi.org/project/ensemble-integration/ +.. |PythonVersion| image:: https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11-blue + .. |Black| image:: https://img.shields.io/badge/code%20style-black-000000.svg :target: https://github.com/psf/black From 2cbb25518e89c8e9af9e048e24835eb9e62845cf Mon Sep 17 00:00:00 2001 From: 03bennej Date: Fri, 5 Apr 2024 10:01:11 -0400 Subject: [PATCH 10/16] minor change --- docs/source/development.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/development.rst b/docs/source/development.rst index f44941f..2addee3 100644 --- a/docs/source/development.rst +++ b/docs/source/development.rst @@ -1,7 +1,7 @@ Development =========== -We welcome contributions to the development of ``eipy``. To contribute follow the below instructions to submit a pull request: +We welcome contributions to the development of ``ensemble-integration``. To contribute follow the below instructions to submit a pull request: 1. **Install Python**. First of all make sure you have a supported version of Python on your local machine (see `GitHub `__ for supported versions). 2. **Install Poetry**. ``eipy`` uses Poetry to manage dependencies. To install Poetry follow the instructions on their `website `__. From 0c77e503e8849601540a11c7d1c38d8d8e92ce7d Mon Sep 17 00:00:00 2001 From: Aviad Susman Date: Fri, 5 Apr 2024 10:19:44 -0400 Subject: [PATCH 11/16] fixed method definition formatting --- eipy/datasets.py | 4 ++-- eipy/interpretation.py | 4 ++-- eipy/metrics.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/eipy/datasets.py b/eipy/datasets.py index 7e79315..7bba460 100644 --- a/eipy/datasets.py +++ b/eipy/datasets.py @@ -25,7 +25,7 @@ def load_diabetes(): """ zenodo_link = "https://zenodo.org/records/10035422/files/diabetes.zip?download=1" # Get data path - data_path = get_data_home() + data_path = _get_data_home() folder_ext = "diabetes" data_ext_path = join(data_path, folder_ext) # check data downloaded before @@ -66,7 +66,7 @@ def _load_csv(file_path, fn, suffix): return pd.read_csv(join(file_path, f"{fn}_{suffix}.csv"), index_col=0) -def get_data_home(data_home=None): +def _get_data_home(data_home=None): """Return the path of the eipy data directory. 
This function is referring from scikit-learn. diff --git a/eipy/interpretation.py b/eipy/interpretation.py index cb23c99..fc65c9c 100644 --- a/eipy/interpretation.py +++ b/eipy/interpretation.py @@ -102,10 +102,10 @@ def rank_product_score(self, X_dict, y): ensemble_predictor_keys = self.ensemble_predictor_keys if self.LFR is None: - self.local_feature_rank(X_dict, _y_to_numpy(y)) + self._local_feature_rank(X_dict, _y_to_numpy(y)) if self.LMR is None: - self.local_model_rank(ensemble_predictor_keys=ensemble_predictor_keys) + self._local_model_rank(ensemble_predictor_keys=ensemble_predictor_keys) print("Calculating combined rank product score...") diff --git a/eipy/metrics.py b/eipy/metrics.py index 469263d..74e84d5 100644 --- a/eipy/metrics.py +++ b/eipy/metrics.py @@ -170,7 +170,7 @@ def _base_summary(ensemble_test_dataframes, metrics): for df in ensemble_test_dataframes ] ) - return create_metric_threshold_dict(ensemble_test_averaged_samples, labels, metrics) + return _create_metric_threshold_dict(ensemble_test_averaged_samples, labels, metrics) def _ensemble_summary(ensemble_predictions, metrics): From 57c04e0c2304ca5c27b33014c998a92c7a8c3588 Mon Sep 17 00:00:00 2001 From: aviadsusman Date: Mon, 25 Mar 2024 15:08:02 -0400 Subject: [PATCH 12/16] initial commit --- .gitignore | 4 +--- eipy/additional_ensembles.py | 44 ++++++++++++++++++++++++++++++++++++ eipy/ei.py | 4 +--- eipy/metrics.py | 21 +++++++++++++++++ 4 files changed, 67 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 9bac2e8..fe4e7e1 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,4 @@ __pycache__ .venv .tox docs/build -.coverage -poetry.lock -coverage.xml +poetry.lock \ No newline at end of file diff --git a/eipy/additional_ensembles.py b/eipy/additional_ensembles.py index c62a4a9..198909e 100644 --- a/eipy/additional_ensembles.py +++ b/eipy/additional_ensembles.py @@ -14,7 +14,18 @@ class MeanAggregation(BaseEstimator, ClassifierMixin): """ + Mean Aggregation + Trivially takes the mean of X. + + Attributes + ---------- + classes : array + Ordered arrray of unique labels for computing mean. + X_ : array of (n_samples, n_features) + Base predictor data for computing mean. + y_ : array of (n_samples,) + True labels of X_. """ def __init__(self): @@ -36,7 +47,18 @@ def predict_proba(self, X): class MedianAggregation(BaseEstimator, ClassifierMixin): """ + Median Aggregation + Trivially takes the median of X. + + Attributes + ---------- + classes : array + Ordered arrray of unique labels for computing mean. + X_ : array of (n_samples, n_features) + Base predictor data for computing mean. + y_ : array of (n_samples,) + True labels of X_. """ def __init__(self): @@ -63,6 +85,28 @@ class CES(BaseEstimator, ClassifierMixin): Caruana R. et al. (2006) Getting the most out of ensemble selection. In: Sixth International Conference on Data Mining (ICDM'06), 2006 IEEE, Piscataway, NJ, USA, pp. 828-833. + + Sort models by score with respect to chosen metric. Select best performer + + Parameters + ---------- + scoring : + + max_ensemble_size : int + Maximum number of base models to ensemble. + random_state : int + For determining a rarndom state + greater_is_better : bool + + + Attributes + ---------- + classes : array + Ordered arrray of unique labels for computing mean. + X_ : array of (n_samples, n_features) + Base predictor data for computing mean. + y_ : array of (n_samples,) + True labels of X_. 
""" def __init__( diff --git a/eipy/ei.py b/eipy/ei.py index 8bc13fc..984053a 100755 --- a/eipy/ei.py +++ b/eipy/ei.py @@ -314,7 +314,7 @@ def fit_ensemble(self, ensemble_predictors=None): def predict(self, X_dict, ensemble_model_key): """ - Predict class labels for samples in X + Predict class labels for samples in X. Parameters ---------- @@ -677,7 +677,6 @@ def save(self, path=None): Parameters ---------- - path : optional, default=None Path to save the EnsembleIntegration class object. """ @@ -695,7 +694,6 @@ def load(cls, path): Parameters ---------- - path : str Path to load the EnsembleIntegration class object. """ diff --git a/eipy/metrics.py b/eipy/metrics.py index 3233277..14ffbf3 100644 --- a/eipy/metrics.py +++ b/eipy/metrics.py @@ -6,6 +6,27 @@ def fmax_score(y_test, y_score, beta=1.0, pos_label=1): + """ + Computes the maximum F-score (the harmonic mean of precision and recall) and the corresponding threshold. + + Parameters + ---------- + y_test : array of shape (n_samples,) + Array of test labels. + y_pred : array of shape (n_samples,) + Array of predicted probabilities on test data. + beta : float + Parameter for weighing precision and recall in F score calculations. + pos_label : bool + Class selection for computing F scores. + + Returns + ------- + fmax_score : float64 + Calculated fmax + threshold_fmax : float64 + Threshold corresponding to returned fmax + """ fmax_score, _, _, threshold_fmax = fmax_precision_recall_threshold( y_test, y_score, beta=beta, pos_label=pos_label ) From 1d6a0eac49d420ae4147b28840073686e7e3c0ad Mon Sep 17 00:00:00 2001 From: Aviad Susman Date: Mon, 25 Mar 2024 16:19:26 -0400 Subject: [PATCH 13/16] all docstrings added, nonpublic methods indicated --- eipy/additional_ensembles.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/eipy/additional_ensembles.py b/eipy/additional_ensembles.py index 198909e..b803264 100644 --- a/eipy/additional_ensembles.py +++ b/eipy/additional_ensembles.py @@ -86,8 +86,6 @@ class CES(BaseEstimator, ClassifierMixin): In: Sixth International Conference on Data Mining (ICDM'06), 2006 IEEE, Piscataway, NJ, USA, pp. 828-833. - Sort models by score with respect to chosen metric. Select best performer - Parameters ---------- scoring : @@ -95,18 +93,20 @@ class CES(BaseEstimator, ClassifierMixin): max_ensemble_size : int Maximum number of base models to ensemble. random_state : int - For determining a rarndom state + For determining a random state. greater_is_better : bool - + For sorting models by performance with respect to a metric. Attributes ---------- - classes : array - Ordered arrray of unique labels for computing mean. - X_ : array of (n_samples, n_features) - Base predictor data for computing mean. - y_ : array of (n_samples,) - True labels of X_. + selected_ensemble : list + List of models selected for ensemble. + train_performance : list + Record of model performances. + argbest : bool + True if metric of interest is to be maximized. Used for model selection. + best : bool + True if metric of interest is to be maximized. Used for selecting maximum scorers. 
""" def __init__( From 388866a5e98287bf84c94975031f0a646dee5549 Mon Sep 17 00:00:00 2001 From: Aviad Susman Date: Mon, 25 Mar 2024 16:20:13 -0400 Subject: [PATCH 14/16] ready for pr --- eipy/ei.py | 66 +++++++++++++++++++++--------------------- eipy/interpretation.py | 12 ++++---- eipy/metrics.py | 34 +++++++++++----------- eipy/utils.py | 28 +++++++++--------- 4 files changed, 70 insertions(+), 70 deletions(-) diff --git a/eipy/ei.py b/eipy/ei.py index 984053a..a8b45d6 100755 --- a/eipy/ei.py +++ b/eipy/ei.py @@ -17,21 +17,21 @@ from joblib import Parallel, delayed import warnings from eipy.utils import ( - X_is_dict, - X_to_numpy, - y_to_numpy, - set_predictor_seeds, - random_integers, - sample, - retrieve_X_y, - append_modality, - safe_predict_proba, + _X_is_dict, + _X_to_numpy, + _y_to_numpy, + _set_predictor_seeds, + _random_integers, + _sample, + _retrieve_X_y, + _append_modality, + _safe_predict_proba, dummy_cv, bar_format, ) from eipy.metrics import ( - base_summary, - ensemble_summary, + _base_summary, + _ensemble_summary, ) warnings.filterwarnings("ignore", category=DeprecationWarning) @@ -181,7 +181,7 @@ def __init__( self.modality_names = [] self.n_features_per_modality = [] - self.random_numbers_for_samples = random_integers( + self.random_numbers_for_samples = _random_integers( n_integers=n_samples, seed=self.random_state ) self.feature_names = {} @@ -210,17 +210,17 @@ def fit_base(self, X, y, base_predictors=None, modality_name=None): \n... for ensemble performance analysis...""" ) # convert y to a numpy array - y = y_to_numpy(y) + y = _y_to_numpy(y) # check if base_predictors are passed here if base_predictors is not None: self.base_predictors = base_predictors # update base predictors # set random_states in base_predictors - set_predictor_seeds(self.base_predictors, self.random_state) + _set_predictor_seeds(self.base_predictors, self.random_state) # check data format and train accordingly - if X_is_dict(X): + if _X_is_dict(X): for modality_name, modality in X.items(): self._fit_base( X=modality, @@ -252,12 +252,12 @@ def fit_ensemble(self, ensemble_predictors=None): if ensemble_predictors is not None: self.ensemble_predictors = ensemble_predictors - set_predictor_seeds(self.ensemble_predictors, self.random_state) + _set_predictor_seeds(self.ensemble_predictors, self.random_state) y_test_combined = [] for fold_id in range(self.k_outer): - _, y_test = retrieve_X_y(labelled_data=self.ensemble_test_data[fold_id]) + _, y_test = _retrieve_X_y(labelled_data=self.ensemble_test_data[fold_id]) y_test_combined.extend(y_test) ensemble_predictions = {} @@ -270,17 +270,17 @@ def fit_ensemble(self, ensemble_predictors=None): y_pred_combined = [] for fold_id in range(self.k_outer): - X_train, y_train = retrieve_X_y( + X_train, y_train = _retrieve_X_y( labelled_data=self.ensemble_training_data[fold_id] ) - X_test, _ = retrieve_X_y(labelled_data=self.ensemble_test_data[fold_id]) + X_test, _ = _retrieve_X_y(labelled_data=self.ensemble_test_data[fold_id]) if self.sampling_aggregation == "mean": X_train = X_train.T.groupby(level=[0, 1]).mean().T X_test = X_test.T.groupby(level=[0, 1]).mean().T model.fit(X_train, y_train) - y_pred = safe_predict_proba(model, X_test) + y_pred = _safe_predict_proba(model, X_test) y_pred_combined.extend(y_pred) ensemble_predictions[model_name] = y_pred_combined @@ -288,7 +288,7 @@ def fit_ensemble(self, ensemble_predictors=None): ensemble_predictions["labels"] = y_test_combined self.ensemble_predictions = pd.DataFrame.from_dict(ensemble_predictions) - 
self.ensemble_summary = ensemble_summary( + self.ensemble_summary = _ensemble_summary( self.ensemble_predictions, self.metrics ) @@ -298,7 +298,7 @@ def fit_ensemble(self, ensemble_predictors=None): desc="Training final ensemble models", bar_format=bar_format, ): - X_train, y_train = retrieve_X_y( + X_train, y_train = _retrieve_X_y( labelled_data=self.ensemble_training_data_final[0] ) @@ -336,7 +336,7 @@ def predict(self, X_dict, ensemble_model_key): modality_name = self.modality_names[i] X = X_dict[modality_name] - X, _ = X_to_numpy(X) + X, _ = _X_to_numpy(X) base_models = copy.deepcopy(self.final_models["base models"][modality_name]) self.base_predictors = {} @@ -345,7 +345,7 @@ def predict(self, X_dict, ensemble_model_key): self.base_predictors[base_model_dict["model name"]] = 0 base_model = pickle.loads(base_model_dict["pickled model"]) - y_pred = safe_predict_proba(base_model, X) + y_pred = _safe_predict_proba(base_model, X) base_model_dict["fold id"] = 0 base_model_dict["y_pred"] = y_pred @@ -353,7 +353,7 @@ def predict(self, X_dict, ensemble_model_key): combined_predictions = self._combine_predictions_outer( base_models, modality_name, model_building=True ) - ensemble_prediction_data = append_modality( + ensemble_prediction_data = _append_modality( ensemble_prediction_data, combined_predictions, model_building=True ) ensemble_prediction_data = ensemble_prediction_data[0] @@ -367,12 +367,12 @@ def predict(self, X_dict, ensemble_model_key): self.final_models["ensemble models"][ensemble_model_key] ) - y_pred = safe_predict_proba(ensemble_model, ensemble_prediction_data) + y_pred = _safe_predict_proba(ensemble_model, ensemble_prediction_data) return y_pred @ignore_warnings(category=ConvergenceWarning) def _fit_base(self, X, y, base_predictors=None, modality_name=None): - X, feature_names = X_to_numpy(X) + X, feature_names = _X_to_numpy(X) self.modality_names.append(modality_name) self.feature_names[modality_name] = feature_names @@ -387,7 +387,7 @@ def _fit_base(self, X, y, base_predictors=None, modality_name=None): modality_name=modality_name, ) - self.ensemble_training_data = append_modality( + self.ensemble_training_data = _append_modality( self.ensemble_training_data, ensemble_training_data_modality ) @@ -399,12 +399,12 @@ def _fit_base(self, X, y, base_predictors=None, modality_name=None): modality_name=modality_name, ) - self.ensemble_test_data = append_modality( + self.ensemble_test_data = _append_modality( self.ensemble_test_data, ensemble_test_data_modality ) # append data to dataframe # create a summary of base predictor performance - self.base_summary = base_summary(self.ensemble_test_data, self.metrics) + self.base_summary = _base_summary(self.ensemble_test_data, self.metrics) if self.model_building: self._fit_base_final(X=X, y=y, modality_name=modality_name) @@ -428,7 +428,7 @@ def _fit_base_final(self, X, y, modality_name=None): modality_name=modality_name, ) - self.ensemble_training_data_final = append_modality( + self.ensemble_training_data_final = _append_modality( self.ensemble_training_data_final, ensemble_training_data_modality ) @@ -562,7 +562,7 @@ def _train_predict_single_base_predictor( X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] - X_sample, y_sample = sample( + X_sample, y_sample = _sample( X_train, y_train, strategy=self.sampling_strategy, @@ -581,7 +581,7 @@ def _train_predict_single_base_predictor( } else: - y_pred = safe_predict_proba(model, X_test) + y_pred = _safe_predict_proba(model, X_test) results_dict 
= { "model name": model_name, diff --git a/eipy/interpretation.py b/eipy/interpretation.py index 8b6025a..cb23c99 100644 --- a/eipy/interpretation.py +++ b/eipy/interpretation.py @@ -1,5 +1,5 @@ from sklearn.inspection import permutation_importance -from eipy.utils import X_to_numpy, retrieve_X_y, bar_format, y_to_numpy +from eipy.utils import _X_to_numpy, _retrieve_X_y, bar_format, _y_to_numpy import pandas as pd from tqdm import tqdm import numpy as np @@ -102,7 +102,7 @@ def rank_product_score(self, X_dict, y): ensemble_predictor_keys = self.ensemble_predictor_keys if self.LFR is None: - self.local_feature_rank(X_dict, y_to_numpy(y)) + self.local_feature_rank(X_dict, _y_to_numpy(y)) if self.LMR is None: self.local_model_rank(ensemble_predictor_keys=ensemble_predictor_keys) @@ -151,7 +151,7 @@ def rank_product_score(self, X_dict, y): return self - def local_feature_rank(self, X_dict, y): + def _local_feature_rank(self, X_dict, y): """ Local Feature Ranks (LFRs) for each base predictor @@ -177,7 +177,7 @@ def local_feature_rank(self, X_dict, y): bar_format=bar_format, ): X = X_dict[modality_name] - X, feature_names = X_to_numpy(X) + X, feature_names = _X_to_numpy(X) # check feature names were seen during training if len(self.EI.feature_names[modality_name]) > 1: @@ -285,7 +285,7 @@ def local_feature_rank(self, X_dict, y): return self - def local_model_rank(self, ensemble_predictor_keys): + def _local_model_rank(self, ensemble_predictor_keys): """ Local Model Ranks (LMRs) @@ -302,7 +302,7 @@ def local_model_rank(self, ensemble_predictor_keys): """ # load ensemble training data from EI training - ensemble_X_train, ensemble_y_train = retrieve_X_y( + ensemble_X_train, ensemble_y_train = _retrieve_X_y( labelled_data=self.EI.ensemble_training_data_final[0] ) diff --git a/eipy/metrics.py b/eipy/metrics.py index 14ffbf3..469263d 100644 --- a/eipy/metrics.py +++ b/eipy/metrics.py @@ -1,7 +1,7 @@ import numpy as np import pandas as pd import inspect -from eipy.utils import minority_class +from eipy.utils import _minority_class from sklearn.metrics import roc_auc_score, precision_recall_curve @@ -27,13 +27,13 @@ def fmax_score(y_test, y_score, beta=1.0, pos_label=1): threshold_fmax : float64 Threshold corresponding to returned fmax """ - fmax_score, _, _, threshold_fmax = fmax_precision_recall_threshold( + fmax_score, _, _, threshold_fmax = _fmax_precision_recall_threshold( y_test, y_score, beta=beta, pos_label=pos_label ) return fmax_score, threshold_fmax -def fmax_precision_recall_threshold(labels, y_score, beta=1.0, pos_label=1): +def _fmax_precision_recall_threshold(labels, y_score, beta=1.0, pos_label=1): """ Radivojac, P. et al. (2013). A Large-Scale Evaluation of Computational Protein Function Prediction. Nature Methods, 10(3), 221-227. @@ -65,7 +65,7 @@ def fmax_precision_recall_threshold(labels, y_score, beta=1.0, pos_label=1): return fmax_score, precision_fmax, recall_fmax, threshold_fmax -def try_metric_with_pos_label(y_true, y_pred, metric, pos_label): +def _try_metric_with_pos_label(y_true, y_pred, metric, pos_label): """ Compute score for a given metric. """ @@ -76,7 +76,7 @@ def try_metric_with_pos_label(y_true, y_pred, metric, pos_label): return score -def scores(y_true, y_pred, metrics): +def _scores(y_true, y_pred, metrics): """ Compute all metrics for a single set of predictions. Returns a dictionary containing metric keys, each paired to a tuple (score, threshold). 
@@ -86,7 +86,7 @@ def scores(y_true, y_pred, metrics): if metrics is None: metrics = {"fmax (minority)": fmax_score, "auc": roc_auc_score} - pos_label = minority_class(y_true) # gives value 1 or 0 + pos_label = _minority_class(y_true) # gives value 1 or 0 metric_threshold_dict = {} @@ -96,14 +96,14 @@ def scores(y_true, y_pred, metrics): if "y_pred" in inspect.signature(metric).parameters: # calculate metric for target vector with threshold=0.5 metric_threshold_dict[metric_key] = ( - try_metric_with_pos_label( + _try_metric_with_pos_label( y_true, (np.array(y_pred) >= 0.5).astype(int), metric, pos_label ), 0.5, ) # if y_score parameter exists in metric function then y should be probability vector elif "y_score" in inspect.signature(metric).parameters: - metric_results = try_metric_with_pos_label( + metric_results = _try_metric_with_pos_label( y_true, y_pred, metric, pos_label ) if isinstance( @@ -116,7 +116,7 @@ def scores(y_true, y_pred, metrics): return metric_threshold_dict -def scores_matrix(X, labels, metrics): +def _scores_matrix(X, labels, metrics): """ Calculate metrics and threshold (if applicable) for each column (set of predictions) in matrix X @@ -125,7 +125,7 @@ def scores_matrix(X, labels, metrics): scores_dict = {} for column in X.columns: column_temp = X[column] - metrics_per_column = scores(labels, column_temp, metrics) + metrics_per_column = _scores(labels, column_temp, metrics) # metric_names = list(metrics.keys()) for metric_key in metrics_per_column.keys(): if not (metric_key in scores_dict): @@ -136,13 +136,13 @@ def scores_matrix(X, labels, metrics): return scores_dict -def create_metric_threshold_dataframes(X, labels, metrics): +def _create_metric_threshold_dataframes(X, labels, metrics): """ Create a separate dataframe for metrics and thresholds. thresholds_df contains NaN if threshold not applicable. 
""" - scores_dict = scores_matrix(X, labels, metrics) + scores_dict = _scores_matrix(X, labels, metrics) metrics_df = pd.DataFrame(columns=X.columns) thresholds_df = pd.DataFrame(columns=X.columns) @@ -151,15 +151,15 @@ def create_metric_threshold_dataframes(X, labels, metrics): return metrics_df, thresholds_df -def create_metric_threshold_dict(X, labels, metrics): +def _create_metric_threshold_dict(X, labels, metrics): df_dict = {} - df_dict["metrics"], df_dict["thresholds"] = create_metric_threshold_dataframes( + df_dict["metrics"], df_dict["thresholds"] = _create_metric_threshold_dataframes( X, labels, metrics ) return df_dict -def base_summary(ensemble_test_dataframes, metrics): +def _base_summary(ensemble_test_dataframes, metrics): """ Create a base predictor performance summary by concatenating data across test folds """ @@ -173,10 +173,10 @@ def base_summary(ensemble_test_dataframes, metrics): return create_metric_threshold_dict(ensemble_test_averaged_samples, labels, metrics) -def ensemble_summary(ensemble_predictions, metrics): +def _ensemble_summary(ensemble_predictions, metrics): X = ensemble_predictions.drop(["labels"], axis=1) labels = ensemble_predictions["labels"] - return create_metric_threshold_dict(X, labels, metrics) + return _create_metric_threshold_dict(X, labels, metrics) # These two functions are an attempt at maximizing/minimizing any metric diff --git a/eipy/utils.py b/eipy/utils.py index f81bbff..31ba545 100755 --- a/eipy/utils.py +++ b/eipy/utils.py @@ -15,7 +15,7 @@ bar_format = "{desc}: |{bar}|{percentage:3.0f}%" -def minority_class(y_true): +def _minority_class(y_true): if np.bincount(y_true)[0] < np.bincount(y_true)[1]: minority_class = 0 else: @@ -23,7 +23,7 @@ def minority_class(y_true): return minority_class -def set_predictor_seeds(base_predictors, random_state): +def _set_predictor_seeds(base_predictors, random_state): for _, v in base_predictors.items(): if type(v) == Pipeline: est_ = list(v.named_steps)[-1] @@ -33,25 +33,25 @@ def set_predictor_seeds(base_predictors, random_state): v.set_params(**{"random_state": random_state}) -def X_is_dict(X): +def _X_is_dict(X): if isinstance(X, dict): return True else: return False -def X_dict_to_numpy(X_dict): +def _X_dict_to_numpy(X_dict): """ Retrieve feature names and convert arrays to numpy. """ X_dict_numpy = {} feature_names = {} for key, X in X_dict.items(): - X_dict_numpy[key], feature_names[key] = X_to_numpy(X) + X_dict_numpy[key], feature_names[key] = _X_to_numpy(X) return X_dict_numpy, feature_names -def X_to_numpy(X): +def _X_to_numpy(X): """ Return X as a numpy array, with feature names if applicable. """ @@ -66,7 +66,7 @@ def X_to_numpy(X): ) -def y_to_numpy(y): +def _y_to_numpy(y): """ Check y is numpy array and convert if not. 
""" @@ -85,13 +85,13 @@ def y_to_numpy(y): or pandas Series.""" ) - if not is_binary_array(_y): + if not _is_binary_array(_y): raise ValueError("y must contain binary values.") return _y -def is_binary_array(arr): +def _is_binary_array(arr): if all(x == 0 or x == 1 or x == 0.0 or x == 1.0 for x in arr): return True else: @@ -110,7 +110,7 @@ def get_n_splits(self, X, y, groups=None): return self.n_splits -def safe_predict_proba(model, X): # uses predict_proba method where possible +def _safe_predict_proba(model, X): # uses predict_proba method where possible if hasattr(model, "predict_proba"): y_pred = model.predict_proba(X)[:, 1] else: @@ -118,12 +118,12 @@ def safe_predict_proba(model, X): # uses predict_proba method where possible return y_pred -def random_integers(n_integers=1, seed=42): +def _random_integers(n_integers=1, seed=42): random.seed(seed) return random.sample(range(0, 10000), n_integers) -def sample(X, y, strategy, random_state): +def _sample(X, y, strategy, random_state): if strategy is None: X_resampled, y_resampled = X, y elif strategy == "undersampling": # define sampler @@ -161,13 +161,13 @@ def sample(X, y, strategy, random_state): return X_resampled, y_resampled -def retrieve_X_y(labelled_data): +def _retrieve_X_y(labelled_data): X = labelled_data.drop(columns=["labels"], level=0) y = np.ravel(labelled_data["labels"]) return X, y -def append_modality(current_data, modality_data, model_building=False): +def _append_modality(current_data, modality_data, model_building=False): if current_data is None: combined_dataframe = modality_data else: From 656e608b3a0599de68cbf441e43d18975a8320ce Mon Sep 17 00:00:00 2001 From: Aviad Susman Date: Fri, 5 Apr 2024 10:19:44 -0400 Subject: [PATCH 15/16] fixed method definition formatting --- eipy/datasets.py | 4 ++-- eipy/interpretation.py | 4 ++-- eipy/metrics.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/eipy/datasets.py b/eipy/datasets.py index 7e79315..7bba460 100644 --- a/eipy/datasets.py +++ b/eipy/datasets.py @@ -25,7 +25,7 @@ def load_diabetes(): """ zenodo_link = "https://zenodo.org/records/10035422/files/diabetes.zip?download=1" # Get data path - data_path = get_data_home() + data_path = _get_data_home() folder_ext = "diabetes" data_ext_path = join(data_path, folder_ext) # check data downloaded before @@ -66,7 +66,7 @@ def _load_csv(file_path, fn, suffix): return pd.read_csv(join(file_path, f"{fn}_{suffix}.csv"), index_col=0) -def get_data_home(data_home=None): +def _get_data_home(data_home=None): """Return the path of the eipy data directory. This function is referring from scikit-learn. 
diff --git a/eipy/interpretation.py b/eipy/interpretation.py index cb23c99..fc65c9c 100644 --- a/eipy/interpretation.py +++ b/eipy/interpretation.py @@ -102,10 +102,10 @@ def rank_product_score(self, X_dict, y): ensemble_predictor_keys = self.ensemble_predictor_keys if self.LFR is None: - self.local_feature_rank(X_dict, _y_to_numpy(y)) + self._local_feature_rank(X_dict, _y_to_numpy(y)) if self.LMR is None: - self.local_model_rank(ensemble_predictor_keys=ensemble_predictor_keys) + self._local_model_rank(ensemble_predictor_keys=ensemble_predictor_keys) print("Calculating combined rank product score...") diff --git a/eipy/metrics.py b/eipy/metrics.py index 469263d..74e84d5 100644 --- a/eipy/metrics.py +++ b/eipy/metrics.py @@ -170,7 +170,7 @@ def _base_summary(ensemble_test_dataframes, metrics): for df in ensemble_test_dataframes ] ) - return create_metric_threshold_dict(ensemble_test_averaged_samples, labels, metrics) + return _create_metric_threshold_dict(ensemble_test_averaged_samples, labels, metrics) def _ensemble_summary(ensemble_predictions, metrics): From 8c1b283db3865dfc1c2e539ce0a4e3a0ecf5b973 Mon Sep 17 00:00:00 2001 From: Aviad Susman Date: Fri, 5 Apr 2024 12:58:35 -0400 Subject: [PATCH 16/16] .gitignore changes --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 6ad0135..cdca365 100644 --- a/.gitignore +++ b/.gitignore @@ -2,5 +2,6 @@ __pycache__ .venv .tox docs/build +.coverage poetry.lock coverage.xml \ No newline at end of file
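
The docstrings added across this series describe a few small algorithms that are worth seeing end to end; the sketches that follow are illustrations written against the diffs above, not code from the package itself.

The fmax metric documented in eipy/metrics.py (patches 01 and 12) maximizes the F-beta score, F_beta = (1 + beta^2) * precision * recall / (beta^2 * precision + recall), over every decision threshold on the precision-recall curve, and reports the maximizing threshold alongside the score. A minimal sketch of that computation, assuming only scikit-learn's precision_recall_curve (the package's own version is the _fmax_precision_recall_threshold helper shown in the diffs, and the function name below is invented for this sketch):

    import numpy as np
    from sklearn.metrics import precision_recall_curve

    def fmax_sketch(y_true, y_score, beta=1.0, pos_label=1):
        # Precision and recall at every candidate threshold on the PR curve.
        precision, recall, thresholds = precision_recall_curve(
            y_true, y_score, pos_label=pos_label
        )
        # F-beta at each operating point, guarding the 0/0 case.
        num = (1 + beta**2) * precision * recall
        den = beta**2 * precision + recall
        fscores = np.divide(num, den, out=np.zeros_like(num), where=den > 0)
        # precision_recall_curve returns one more (precision, recall) pair
        # than thresholds, so drop the sentinel point before taking the argmax.
        best = int(np.argmax(fscores[:-1]))
        return fscores[best], thresholds[best]

On a toy input such as fmax_sketch(np.array([0, 1, 1, 0]), np.array([0.2, 0.8, 0.6, 0.4])) this returns (1.0, 0.6): thresholding at 0.6 recovers both positives with no false positives.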
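
The CES class cites Caruana et al. (2006), whose core procedure is greedy forward selection with replacement: starting from an empty ensemble, repeatedly add whichever base model most improves the ensemble's score under the chosen metric, stopping at max_ensemble_size or when no candidate helps. A compact sketch of that selection loop, under the assumptions that columns of X hold base-model prediction scores, that metric is a callable such as roc_auc_score, and that greater scores are better (the class's greater_is_better flag generalizes this):

    import numpy as np

    def ces_sketch(X, y, metric, max_ensemble_size=10):
        """Greedy forward selection with replacement over base-model columns."""
        X = np.asarray(X)
        selected = []                       # column indices; repeats allowed
        running_sum = np.zeros(X.shape[0])  # sum of the selected models' scores
        best_score = -np.inf
        for _ in range(max_ensemble_size):
            # Score each candidate ensemble: current members plus one more
            # model, aggregated as a simple mean of prediction scores.
            candidate_scores = [
                metric(y, (running_sum + X[:, j]) / (len(selected) + 1))
                for j in range(X.shape[1])
            ]
            j_best = int(np.argmax(candidate_scores))
            if candidate_scores[j_best] <= best_score:
                break                       # no candidate improves; stop early
            best_score = candidate_scores[j_best]
            selected.append(j_best)
            running_sum += X[:, j_best]
        return selected, best_score

Selection with replacement lets a strong model be chosen repeatedly, which effectively weights it more heavily in the mean; it is one of the devices the 2006 paper describes for reducing overfitting during selection.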
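
MeanAggregation and MedianAggregation, documented in patches 01 and 12, are deliberately trivial ensemblers: fit records the data, and prediction reduces each row of base-predictor scores to its mean or median. A toy version of the mean variant in the scikit-learn estimator style the diffs show (the class name and simplified interface here are illustrative, not the package's code):

    import numpy as np
    from sklearn.base import BaseEstimator, ClassifierMixin

    class MeanAggregationSketch(BaseEstimator, ClassifierMixin):
        """Each column of X is one base predictor's positive-class score."""

        def fit(self, X, y):
            self.classes_ = np.unique(y)  # required by the sklearn contract
            return self

        def predict_proba(self, X):
            pos = np.asarray(X).mean(axis=1)       # average the base models
            return np.column_stack([1 - pos, pos])

        def predict(self, X):
            return (self.predict_proba(X)[:, 1] >= 0.5).astype(int)

Replacing the mean with np.median(np.asarray(X), axis=1) in predict_proba gives the median variant.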
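
The interpretation methods renamed in patches 03 and 14 (_local_feature_rank, _local_model_rank, and the public rank_product_score that calls them) combine permutation-importance rankings from two levels: features within each base predictor, and base predictors within the ensemble. The diffs do not show the combination formula itself, but the rank product statistic the method name points to is the geometric mean of an item's ranks across rankings, so a generic sketch of that statistic looks like:

    import numpy as np

    def rank_product_sketch(rank_matrix):
        # rank_matrix: rows are features, columns are rankings produced by
        # different models; ranks start at 1 (best). Taking the mean of the
        # logs avoids overflow for long products, and features that rank
        # consistently well receive the smallest scores.
        ranks = np.asarray(rank_matrix, dtype=float)
        return np.exp(np.log(ranks).mean(axis=1))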
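
Finally, the "ready for pr" patches (03 and 14) together with the follow-ups (11 and 15) underscore-prefix the helpers in eipy.utils, eipy.metrics, eipy.interpretation, and eipy.datasets, leaving the EnsembleIntegration workflow as the public surface. Pieced together from the names and signatures visible in these diffs (fit_base, fit_ensemble, the base_summary and ensemble_summary attributes, and the MeanAggregation and MedianAggregation classes), the intended usage is roughly the following; the constructor arguments and the synthetic data are assumptions for illustration:

    import numpy as np
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from eipy.ei import EnsembleIntegration
    from eipy.additional_ensembles import MeanAggregation, MedianAggregation

    rng = np.random.default_rng(0)
    X_dict = {  # fit_base accepts a dict with one feature array per modality
        "modality_1": rng.normal(size=(100, 5)),
        "modality_2": rng.normal(size=(100, 20)),
    }
    y = rng.integers(0, 2, size=100)

    EI = EnsembleIntegration(random_state=0)  # other arguments left at defaults
    EI.fit_base(
        X_dict,
        y,
        base_predictors={"LR": LogisticRegression(), "RF": RandomForestClassifier()},
    )
    print(EI.base_summary["metrics"])      # built internally by _base_summary
    EI.fit_ensemble(
        ensemble_predictors={"Mean": MeanAggregation(), "Median": MedianAggregation()}
    )
    print(EI.ensemble_summary["metrics"])  # built internally by _ensemble_summary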