2 changes: 1 addition & 1 deletion .gitignore
@@ -4,4 +4,4 @@ __pycache__
docs/build
.coverage
poetry.lock
coverage.xml
coverage.xml
16 changes: 9 additions & 7 deletions README.rst
@@ -1,22 +1,24 @@
|Tests|_ |Coverage|_ |ReadTheDocs|_ |PythonVersion|_ |Black|_ |License|_
|Tests| |Coverage| |ReadTheDocs| |PythonVersion| |PyPI| |Black| |License|

.. |Tests| image:: https://github.com/GauravPandeyLab/eipy/actions/workflows/tests.yml/badge.svg
.. _Tests: https://github.com/GauravPandeyLab/eipy/actions/workflows/tests.yml
:target: https://github.com/GauravPandeyLab/eipy/actions/workflows/tests.yml

.. |Coverage| image:: https://codecov.io/gh/GauravPandeyLab/eipy/graph/badge.svg?token=M2AU2XWJB8
.. _Coverage: https://codecov.io/gh/GauravPandeyLab/eipy
:target: https://codecov.io/gh/GauravPandeyLab/eipy

.. |ReadTheDocs| image:: https://readthedocs.org/projects/eipy/badge/?version=latest
.. _ReadTheDocs: https://eipy.readthedocs.io/en/latest/
:target: https://eipy.readthedocs.io/en/latest/

.. |PyPI| image:: https://img.shields.io/pypi/v/ensemble-integration
:target: https://pypi.org/project/ensemble-integration/

.. |PythonVersion| image:: https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11-blue
.. _PythonVersion: https://github.com/GauravPandeyLab/eipy

.. |Black| image:: https://img.shields.io/badge/code%20style-black-000000.svg
.. _Black: https://github.com/psf/black
:target: https://github.com/psf/black

.. |License| image:: https://img.shields.io/badge/License-GPLv3-blue
.. _License: https://github.com/GauravPandeyLab/eipy/blob/main/COPYING
:target: https://github.com/GauravPandeyLab/eipy/blob/main/COPYING


``ensemble-integration``: Integrating multi-modal data for predictive modeling
6 changes: 3 additions & 3 deletions docs/source/development.rst
@@ -1,7 +1,7 @@
Development
===========

We welcome contributions to the development of ``eipy``. To contribute follow the below instructions to submit a pull request:
We welcome contributions to the development of ``ensemble-integration``. To contribute, follow the instructions below to submit a pull request:

1. **Install Python**. First of all make sure you have a supported version of Python on your local machine (see `GitHub <https://github.com/GauravPandeyLab/eipy>`__ for supported versions).
2. **Install Poetry**. ``eipy`` uses Poetry to manage dependencies. To install Poetry follow the instructions on their `website <https://python-poetry.org/docs/>`__.
@@ -49,9 +49,9 @@ Note that new test file names must have the prefix `test_`.
9. **Submit pull request**. Updates must be made via a pull request. Internal users should note that pushing
to the main branch has been disabled.

10. **Publishing new versions to PyPI** (internal only). We now use `poetry-dynamic-versioning <https://github.com/mtkennerly/poetry-dynamic-versioning>`
10. **Publishing new versions to PyPI** (internal only). We now use `poetry-dynamic-versioning <https://github.com/mtkennerly/poetry-dynamic-versioning>`__
to increment version numbers in pyproject.toml automatically. You can publish to
PyPI by creating a new `release <https://github.com/GauravPandeyLab/eipy/releases>`__,
which will run the "Publish to PyPI" workflow. This workflow determines the PyPI version number from the
GitHub release tag, which you should increment manually.
Note: to test things out first, you can try manually running the "Publish to test PyPI" workflow.
Note: to test things out first, you can try manually running the "Publish to test PyPI" workflow.
44 changes: 44 additions & 0 deletions eipy/additional_ensembles.py
@@ -14,7 +14,18 @@

class MeanAggregation(BaseEstimator, ClassifierMixin):
"""
Mean Aggregation

Trivially takes the mean of X.

Attributes
----------
classes : array
Ordered array of unique class labels.
X_ : array of shape (n_samples, n_features)
Base predictor data used to compute the mean.
y_ : array of shape (n_samples,)
True labels of X_.
"""

def __init__(self):
@@ -36,7 +47,18 @@ def predict_proba(self, X):

class MedianAggregation(BaseEstimator, ClassifierMixin):
"""
Median Aggregation

Trivially takes the median of X.

Attributes
----------
classes : array
Ordered array of unique class labels.
X_ : array of shape (n_samples, n_features)
Base predictor data used to compute the median.
y_ : array of shape (n_samples,)
True labels of X_.
"""

def __init__(self):
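For readers skimming this diff, a minimal sketch of what such a trivial aggregation classifier can look like with scikit-learn's estimator API is given below. The class name, the assumption that each column of X holds a base predictor's positive-class probability, and the 0.5 decision threshold are illustrative only, not eipy's actual implementation:

    import numpy as np
    from sklearn.base import BaseEstimator, ClassifierMixin
    from sklearn.utils.multiclass import unique_labels

    class SimpleMeanAggregation(BaseEstimator, ClassifierMixin):
        """Average base-predictor scores (illustrative sketch only)."""

        def fit(self, X, y):
            # Nothing is learned beyond the label set; X and y are stored as in the docstrings above
            self.classes_ = unique_labels(y)
            self.X_, self.y_ = X, y
            return self

        def predict_proba(self, X):
            # Assumes each column of X is one base predictor's positive-class probability
            pos = np.asarray(X, dtype=float).mean(axis=1)
            return np.column_stack([1 - pos, pos])

        def predict(self, X):
            idx = (self.predict_proba(X)[:, 1] > 0.5).astype(int)
            return self.classes_[idx]

A median analogue would simply replace the column-wise mean with np.median(..., axis=1).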
@@ -63,6 +85,28 @@ class CES(BaseEstimator, ClassifierMixin):
Caruana R. et al. (2006) Getting the most out of ensemble selection.
In: Sixth International Conference on Data
Mining (ICDM'06), 2006 IEEE, Piscataway, NJ, USA, pp. 828-833.

Parameters
----------
scoring :
Metric or scoring function used to evaluate candidate ensembles during selection.
max_ensemble_size : int
Maximum number of base models to ensemble.
random_state : int
Random seed for reproducibility.
greater_is_better : bool
Whether larger values of the metric indicate better performance; used when ranking models.

Attributes
----------
selected_ensemble : list
List of models selected for ensemble.
train_performance : list
Record of model performances.
argbest : bool
True if the metric of interest is to be maximized. Used for model selection.
best : bool
True if the metric of interest is to be maximized. Used for selecting the highest scorers.
"""

def __init__(
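The CES docstring above points to Caruana et al.'s greedy ensemble selection. As a rough, hedged illustration of that algorithm (not the code in this file), assuming X is an array of base-predictor probabilities with one column per model and scoring(y_true, y_pred) returns a scalar:

    import numpy as np

    def greedy_ensemble_selection(X, y, scoring, max_ensemble_size=10, greater_is_better=True):
        """Forward selection with replacement (Caruana et al., 2006) -- illustrative sketch."""
        argbest = np.argmax if greater_is_better else np.argmin
        selected, train_performance = [], []
        for _ in range(max_ensemble_size):
            scores = []
            for m in range(X.shape[1]):
                # Score the ensemble obtained by adding model m (models may repeat)
                candidate = selected + [m]
                scores.append(scoring(y, X[:, candidate].mean(axis=1)))
            best_model = argbest(scores)
            selected.append(best_model)
            train_performance.append(scores[best_model])
        return selected, train_performance

The real implementation may differ in details such as initialisation, early stopping, or tie breaking.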
4 changes: 2 additions & 2 deletions eipy/datasets.py
@@ -25,7 +25,7 @@ def load_diabetes():
"""
zenodo_link = "https://zenodo.org/records/10035422/files/diabetes.zip?download=1"
# Get data path
data_path = get_data_home()
data_path = _get_data_home()
folder_ext = "diabetes"
data_ext_path = join(data_path, folder_ext)
# check data downloaded before
@@ -66,7 +66,7 @@ def _load_csv(file_path, fn, suffix):
return pd.read_csv(join(file_path, f"{fn}_{suffix}.csv"), index_col=0)


def get_data_home(data_home=None):
def _get_data_home(data_home=None):
"""Return the path of the eipy data directory.

This function is adapted from scikit-learn.
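The `_get_data_home` docstring says the helper follows scikit-learn's data-home pattern. A comparable helper usually looks like the sketch below; the EIPY_DATA environment variable and the ~/eipy_data default are assumptions made for illustration, not necessarily what eipy uses:

    from os import environ, makedirs
    from os.path import expanduser, join

    def get_data_home(data_home=None):
        """Return the dataset cache directory, creating it if it does not exist."""
        if data_home is None:
            # Assumed environment variable and default location (illustrative only)
            data_home = environ.get("EIPY_DATA", join("~", "eipy_data"))
        data_home = expanduser(data_home)
        makedirs(data_home, exist_ok=True)
        return data_home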
70 changes: 34 additions & 36 deletions eipy/ei.py
@@ -17,21 +17,21 @@
from joblib import Parallel, delayed
import warnings
from eipy.utils import (
X_is_dict,
X_to_numpy,
y_to_numpy,
set_predictor_seeds,
random_integers,
sample,
retrieve_X_y,
append_modality,
safe_predict_proba,
_X_is_dict,
_X_to_numpy,
_y_to_numpy,
_set_predictor_seeds,
_random_integers,
_sample,
_retrieve_X_y,
_append_modality,
_safe_predict_proba,
dummy_cv,
bar_format,
)
from eipy.metrics import (
base_summary,
ensemble_summary,
_base_summary,
_ensemble_summary,
)

warnings.filterwarnings("ignore", category=DeprecationWarning)
@@ -181,7 +181,7 @@ def __init__(
self.modality_names = []
self.n_features_per_modality = []

self.random_numbers_for_samples = random_integers(
self.random_numbers_for_samples = _random_integers(
n_integers=n_samples, seed=self.random_state
)
self.feature_names = {}
@@ -210,17 +210,17 @@ def fit_base(self, X, y, base_predictors=None, modality_name=None):
\n... for ensemble performance analysis..."""
)
# convert y to a numpy array
y = y_to_numpy(y)
y = _y_to_numpy(y)

# check if base_predictors are passed here
if base_predictors is not None:
self.base_predictors = base_predictors # update base predictors

# set random_states in base_predictors
set_predictor_seeds(self.base_predictors, self.random_state)
_set_predictor_seeds(self.base_predictors, self.random_state)

# check data format and train accordingly
if X_is_dict(X):
if _X_is_dict(X):
for modality_name, modality in X.items():
self._fit_base(
X=modality,
@@ -252,12 +252,12 @@ def fit_ensemble(self, ensemble_predictors=None):
if ensemble_predictors is not None:
self.ensemble_predictors = ensemble_predictors

set_predictor_seeds(self.ensemble_predictors, self.random_state)
_set_predictor_seeds(self.ensemble_predictors, self.random_state)

y_test_combined = []

for fold_id in range(self.k_outer):
_, y_test = retrieve_X_y(labelled_data=self.ensemble_test_data[fold_id])
_, y_test = _retrieve_X_y(labelled_data=self.ensemble_test_data[fold_id])
y_test_combined.extend(y_test)

ensemble_predictions = {}
@@ -270,25 +270,25 @@
y_pred_combined = []

for fold_id in range(self.k_outer):
X_train, y_train = retrieve_X_y(
X_train, y_train = _retrieve_X_y(
labelled_data=self.ensemble_training_data[fold_id]
)
X_test, _ = retrieve_X_y(labelled_data=self.ensemble_test_data[fold_id])
X_test, _ = _retrieve_X_y(labelled_data=self.ensemble_test_data[fold_id])

if self.sampling_aggregation == "mean":
X_train = X_train.T.groupby(level=[0, 1]).mean().T
X_test = X_test.T.groupby(level=[0, 1]).mean().T

model.fit(X_train, y_train)
y_pred = safe_predict_proba(model, X_test)
y_pred = _safe_predict_proba(model, X_test)
y_pred_combined.extend(y_pred)

ensemble_predictions[model_name] = y_pred_combined

ensemble_predictions["labels"] = y_test_combined

self.ensemble_predictions = pd.DataFrame.from_dict(ensemble_predictions)
self.ensemble_summary = ensemble_summary(
self.ensemble_summary = _ensemble_summary(
self.ensemble_predictions, self.metrics
)

@@ -298,7 +298,7 @@ def fit_ensemble(self, ensemble_predictors=None):
desc="Training final ensemble models",
bar_format=bar_format,
):
X_train, y_train = retrieve_X_y(
X_train, y_train = _retrieve_X_y(
labelled_data=self.ensemble_training_data_final[0]
)

@@ -314,7 +314,7 @@ def fit_ensemble(self, ensemble_predictors=None):

def predict(self, X_dict, ensemble_model_key):
"""
Predict class labels for samples in X
Predict class labels for samples in X.

Parameters
----------
@@ -336,7 +336,7 @@ def predict(self, X_dict, ensemble_model_key):
modality_name = self.modality_names[i]
X = X_dict[modality_name]

X, _ = X_to_numpy(X)
X, _ = _X_to_numpy(X)

base_models = copy.deepcopy(self.final_models["base models"][modality_name])
self.base_predictors = {}
@@ -345,15 +345,15 @@ def predict(self, X_dict, ensemble_model_key):
self.base_predictors[base_model_dict["model name"]] = 0

base_model = pickle.loads(base_model_dict["pickled model"])
y_pred = safe_predict_proba(base_model, X)
y_pred = _safe_predict_proba(base_model, X)

base_model_dict["fold id"] = 0
base_model_dict["y_pred"] = y_pred

combined_predictions = self._combine_predictions_outer(
base_models, modality_name, model_building=True
)
ensemble_prediction_data = append_modality(
ensemble_prediction_data = _append_modality(
ensemble_prediction_data, combined_predictions, model_building=True
)
ensemble_prediction_data = ensemble_prediction_data[0]
@@ -367,12 +367,12 @@ def predict(self, X_dict, ensemble_model_key):
self.final_models["ensemble models"][ensemble_model_key]
)

y_pred = safe_predict_proba(ensemble_model, ensemble_prediction_data)
y_pred = _safe_predict_proba(ensemble_model, ensemble_prediction_data)
return y_pred

@ignore_warnings(category=ConvergenceWarning)
def _fit_base(self, X, y, base_predictors=None, modality_name=None):
X, feature_names = X_to_numpy(X)
X, feature_names = _X_to_numpy(X)

self.modality_names.append(modality_name)
self.feature_names[modality_name] = feature_names
@@ -387,7 +387,7 @@ def _fit_base(self, X, y, base_predictors=None, modality_name=None):
modality_name=modality_name,
)

self.ensemble_training_data = append_modality(
self.ensemble_training_data = _append_modality(
self.ensemble_training_data, ensemble_training_data_modality
)

@@ -399,12 +399,12 @@ def _fit_base(self, X, y, base_predictors=None, modality_name=None):
modality_name=modality_name,
)

self.ensemble_test_data = append_modality(
self.ensemble_test_data = _append_modality(
self.ensemble_test_data, ensemble_test_data_modality
) # append data to dataframe

# create a summary of base predictor performance
self.base_summary = base_summary(self.ensemble_test_data, self.metrics)
self.base_summary = _base_summary(self.ensemble_test_data, self.metrics)

if self.model_building:
self._fit_base_final(X=X, y=y, modality_name=modality_name)
@@ -428,7 +428,7 @@ def _fit_base_final(self, X, y, modality_name=None):
modality_name=modality_name,
)

self.ensemble_training_data_final = append_modality(
self.ensemble_training_data_final = _append_modality(
self.ensemble_training_data_final, ensemble_training_data_modality
)

@@ -562,7 +562,7 @@ def _train_predict_single_base_predictor(

X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
X_sample, y_sample = sample(
X_sample, y_sample = _sample(
X_train,
y_train,
strategy=self.sampling_strategy,
@@ -581,7 +581,7 @@
}

else:
y_pred = safe_predict_proba(model, X_test)
y_pred = _safe_predict_proba(model, X_test)

results_dict = {
"model name": model_name,
@@ -677,7 +677,6 @@ def save(self, path=None):

Parameters
----------

path : optional, default=None
Path to save the EnsembleIntegration class object.
"""
@@ -695,7 +694,6 @@ def load(cls, path):

Parameters
----------

path : str
Path to load the EnsembleIntegration class object.
"""
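Taken together, the renames in ei.py leave the public workflow unchanged: fit base predictors per modality, fit ensemble predictors on their cross-validated outputs, then predict with a chosen ensemble model. A hedged usage sketch follows; the constructor arguments, predictor choices, the "S.LR" key, and the synthetic data are assumptions based on the method signatures visible in this diff, not a verbatim example from the package docs:

    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from eipy.ei import EnsembleIntegration

    rng = np.random.default_rng(0)
    y = pd.Series(rng.integers(0, 2, size=100))
    X_dict = {
        "clinical": pd.DataFrame(rng.normal(size=(100, 5))),
        "genomic": pd.DataFrame(rng.normal(size=(100, 20))),
    }

    base_predictors = {"LR": LogisticRegression(), "RF": RandomForestClassifier()}
    ensemble_predictors = {"S.LR": LogisticRegression()}  # a stacked logistic regression

    EI = EnsembleIntegration(
        base_predictors=base_predictors,
        k_outer=5,
        random_state=42,
        model_building=True,  # retain final models so that predict() can be called
    )
    EI.fit_base(X_dict, y)  # dict input trains one set of base models per modality
    EI.fit_ensemble(ensemble_predictors=ensemble_predictors)
    scores = EI.predict(X_dict, ensemble_model_key="S.LR")  # scores from the chosen ensemble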