2 changes: 1 addition & 1 deletion .gitignore
@@ -4,4 +4,4 @@ __pycache__
docs/build
.coverage
poetry.lock
coverage.xml
coverage.xml
16 changes: 9 additions & 7 deletions README.rst
@@ -1,22 +1,24 @@
|Tests|_ |Coverage|_ |ReadTheDocs|_ |PythonVersion|_ |Black|_ |License|_
|Tests| |Coverage| |ReadTheDocs| |PythonVersion| |PyPI| |Black| |License|

.. |Tests| image:: https://github.com/GauravPandeyLab/eipy/actions/workflows/tests.yml/badge.svg
.. _Tests: https://github.com/GauravPandeyLab/eipy/actions/workflows/tests.yml
:target: https://github.com/GauravPandeyLab/eipy/actions/workflows/tests.yml

.. |Coverage| image:: https://codecov.io/gh/GauravPandeyLab/eipy/graph/badge.svg?token=M2AU2XWJB8
.. _Coverage: https://codecov.io/gh/GauravPandeyLab/eipy
:target: https://codecov.io/gh/GauravPandeyLab/eipy

.. |ReadTheDocs| image:: https://readthedocs.org/projects/eipy/badge/?version=latest
.. _ReadTheDocs: https://eipy.readthedocs.io/en/latest/
:target: https://eipy.readthedocs.io/en/latest/

.. |PyPI| image:: https://img.shields.io/pypi/v/ensemble-integration
:target: https://pypi.org/project/ensemble-integration/

.. |PythonVersion| image:: https://img.shields.io/badge/python-3.8%20%7C%203.9%20%7C%203.10%20%7C%203.11-blue
.. _PythonVersion: https://github.com/GauravPandeyLab/eipy

.. |Black| image:: https://img.shields.io/badge/code%20style-black-000000.svg
.. _Black: https://github.com/psf/black
:target: https://github.com/psf/black

.. |License| image:: https://img.shields.io/badge/License-GPLv3-blue
.. _License: https://github.com/GauravPandeyLab/eipy/blob/main/COPYING
:target: https://github.com/GauravPandeyLab/eipy/blob/main/COPYING


``ensemble-integration``: Integrating multi-modal data for predictive modeling
6 changes: 3 additions & 3 deletions docs/source/development.rst
@@ -1,7 +1,7 @@
Development
===========

We welcome contributions to the development of ``eipy``. To contribute follow the below instructions to submit a pull request:
We welcome contributions to the development of ``ensemble-integration``. To contribute, follow the instructions below to submit a pull request:

1. **Install Python**. First of all make sure you have a supported version of Python on your local machine (see `GitHub <https://github.com/GauravPandeyLab/eipy>`__ for supported versions).
2. **Install Poetry**. ``eipy`` uses Poetry to manage dependencies. To install Poetry follow the instructions on their `website <https://python-poetry.org/docs/>`__.
@@ -49,9 +49,9 @@ Note that new test file names must have the prefix `test_`.
9. **Submit pull request**. Updates must be made via a pull request. Internal users should note that pushing
to the main branch has been disabled.

10. **Publishing new versions to PyPI** (internal only). We now use `poetry-dynamic-versioning <https://github.com/mtkennerly/poetry-dynamic-versioning>`
10. **Publishing new versions to PyPI** (internal only). We now use `poetry-dynamic-versioning <https://github.com/mtkennerly/poetry-dynamic-versioning>`__
to increment version numbers in pyproject.toml automatically. You can publish to
PyPI by creating a new `release <https://github.com/GauravPandeyLab/eipy/releases>`__,
which will run the "Publish to PyPI" workflow. This workflow determines the PyPI version number from the
GitHub release tag, which you should increment manually.
Note: to test things out first, you can try manually running the "Publish to test PyPI" workflow.
Note: to test things out first, you can try manually running the "Publish to test PyPI" workflow.
44 changes: 44 additions & 0 deletions eipy/additional_ensembles.py
@@ -14,7 +14,18 @@

class MeanAggregation(BaseEstimator, ClassifierMixin):
"""
Mean Aggregation

Trivially takes the mean of X.

Attributes
----------
classes : array
Ordered array of unique class labels.
X_ : array of shape (n_samples, n_features)
Base predictor data used to compute the mean.
y_ : array of shape (n_samples,)
True labels of X_.
"""

def __init__(self):
@@ -36,7 +47,18 @@ def predict_proba(self, X):

class MedianAggregation(BaseEstimator, ClassifierMixin):
"""
Median Aggregation

Trivially takes the median of X.

Attributes
----------
classes : array
Ordered array of unique class labels.
X_ : array of shape (n_samples, n_features)
Base predictor data used to compute the median.
y_ : array of shape (n_samples,)
True labels of X_.
"""

def __init__(self):
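For readers skimming this diff, a minimal sketch of what such a trivial aggregation classifier can look like with scikit-learn's estimator API is given below. The class name, the assumption that each column of X holds a base predictor's positive-class probability, and the 0.5 decision threshold are illustrative only, not eipy's actual implementation:

    import numpy as np
    from sklearn.base import BaseEstimator, ClassifierMixin
    from sklearn.utils.multiclass import unique_labels

    class SimpleMeanAggregation(BaseEstimator, ClassifierMixin):
        """Average base-predictor scores (illustrative sketch only)."""

        def fit(self, X, y):
            # Nothing is learned beyond the label set; X and y are stored as in the docstrings above
            self.classes_ = unique_labels(y)
            self.X_, self.y_ = X, y
            return self

        def predict_proba(self, X):
            # Assumes each column of X is one base predictor's positive-class probability
            pos = np.asarray(X, dtype=float).mean(axis=1)
            return np.column_stack([1 - pos, pos])

        def predict(self, X):
            idx = (self.predict_proba(X)[:, 1] > 0.5).astype(int)
            return self.classes_[idx]

A median analogue would simply replace the column-wise mean with np.median(..., axis=1).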
@@ -63,6 +85,28 @@ class CES(BaseEstimator, ClassifierMixin):
Caruana R. et al. (2006) Getting the most out of ensemble selection.
In: Sixth International Conference on Data
Mining (ICDM'06), 2006 IEEE, Piscataway, NJ, USA, pp. 828-833.

Parameters
----------
scoring :
Metric or scoring function used to evaluate candidate ensembles during selection.
max_ensemble_size : int
Maximum number of base models to ensemble.
random_state : int
Random seed for reproducibility.
greater_is_better : bool
Whether larger values of the metric indicate better performance; used when ranking models.

Attributes
----------
selected_ensemble : list
List of models selected for ensemble.
train_performance : list
Record of model performances.
argbest : bool
True if the metric of interest is to be maximized. Used for model selection.
best : bool
True if the metric of interest is to be maximized. Used for selecting the highest scorers.
"""

def __init__(
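The CES docstring above points to Caruana et al.'s greedy ensemble selection. As a rough, hedged illustration of that algorithm (not the code in this file), assuming X is an array of base-predictor probabilities with one column per model and scoring(y_true, y_pred) returns a scalar:

    import numpy as np

    def greedy_ensemble_selection(X, y, scoring, max_ensemble_size=10, greater_is_better=True):
        """Forward selection with replacement (Caruana et al., 2006) -- illustrative sketch."""
        argbest = np.argmax if greater_is_better else np.argmin
        selected, train_performance = [], []
        for _ in range(max_ensemble_size):
            scores = []
            for m in range(X.shape[1]):
                # Score the ensemble obtained by adding model m (models may repeat)
                candidate = selected + [m]
                scores.append(scoring(y, X[:, candidate].mean(axis=1)))
            best_model = argbest(scores)
            selected.append(best_model)
            train_performance.append(scores[best_model])
        return selected, train_performance

The real implementation may differ in details such as initialisation, early stopping, or tie breaking.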
4 changes: 2 additions & 2 deletions eipy/datasets.py
@@ -25,7 +25,7 @@ def load_diabetes():
"""
zenodo_link = "https://zenodo.org/records/10035422/files/diabetes.zip?download=1"
# Get data path
data_path = get_data_home()
data_path = _get_data_home()
folder_ext = "diabetes"
data_ext_path = join(data_path, folder_ext)
# check data downloaded before
@@ -66,7 +66,7 @@ def _load_csv(file_path, fn, suffix):
return pd.read_csv(join(file_path, f"{fn}_{suffix}.csv"), index_col=0)


def get_data_home(data_home=None):
def _get_data_home(data_home=None):
"""Return the path of the eipy data directory.

This function is adapted from scikit-learn.
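The `_get_data_home` docstring says the helper follows scikit-learn's data-home pattern. A comparable helper usually looks like the sketch below; the EIPY_DATA environment variable and the ~/eipy_data default are assumptions made for illustration, not necessarily what eipy uses:

    from os import environ, makedirs
    from os.path import expanduser, join

    def get_data_home(data_home=None):
        """Return the dataset cache directory, creating it if it does not exist."""
        if data_home is None:
            # Assumed environment variable and default location (illustrative only)
            data_home = environ.get("EIPY_DATA", join("~", "eipy_data"))
        data_home = expanduser(data_home)
        makedirs(data_home, exist_ok=True)
        return data_home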
70 changes: 34 additions & 36 deletions eipy/ei.py
@@ -17,21 +17,21 @@
from joblib import Parallel, delayed
import warnings
from eipy.utils import (
X_is_dict,
X_to_numpy,
y_to_numpy,
set_predictor_seeds,
random_integers,
sample,
retrieve_X_y,
append_modality,
safe_predict_proba,
_X_is_dict,
_X_to_numpy,
_y_to_numpy,
_set_predictor_seeds,
_random_integers,
_sample,
_retrieve_X_y,
_append_modality,
_safe_predict_proba,
dummy_cv,
bar_format,
)
from eipy.metrics import (
base_summary,
ensemble_summary,
_base_summary,
_ensemble_summary,
)

warnings.filterwarnings("ignore", category=DeprecationWarning)
@@ -181,7 +181,7 @@ def __init__(
self.modality_names = []
self.n_features_per_modality = []

self.random_numbers_for_samples = random_integers(
self.random_numbers_for_samples = _random_integers(
n_integers=n_samples, seed=self.random_state
)
self.feature_names = {}
@@ -210,17 +210,17 @@ def fit_base(self, X, y, base_predictors=None, modality_name=None):
\n... for ensemble performance analysis..."""
)
# convert y to a numpy array
y = y_to_numpy(y)
y = _y_to_numpy(y)

# check if base_predictors are passed here
if base_predictors is not None:
self.base_predictors = base_predictors # update base predictors

# set random_states in base_predictors
set_predictor_seeds(self.base_predictors, self.random_state)
_set_predictor_seeds(self.base_predictors, self.random_state)

# check data format and train accordingly
if X_is_dict(X):
if _X_is_dict(X):
for modality_name, modality in X.items():
self._fit_base(
X=modality,
@@ -252,12 +252,12 @@ def fit_ensemble(self, ensemble_predictors=None):
if ensemble_predictors is not None:
self.ensemble_predictors = ensemble_predictors

set_predictor_seeds(self.ensemble_predictors, self.random_state)
_set_predictor_seeds(self.ensemble_predictors, self.random_state)

y_test_combined = []

for fold_id in range(self.k_outer):
_, y_test = retrieve_X_y(labelled_data=self.ensemble_test_data[fold_id])
_, y_test = _retrieve_X_y(labelled_data=self.ensemble_test_data[fold_id])
y_test_combined.extend(y_test)

ensemble_predictions = {}
@@ -270,25 +270,25 @@
y_pred_combined = []

for fold_id in range(self.k_outer):
X_train, y_train = retrieve_X_y(
X_train, y_train = _retrieve_X_y(
labelled_data=self.ensemble_training_data[fold_id]
)
X_test, _ = retrieve_X_y(labelled_data=self.ensemble_test_data[fold_id])
X_test, _ = _retrieve_X_y(labelled_data=self.ensemble_test_data[fold_id])

if self.sampling_aggregation == "mean":
X_train = X_train.T.groupby(level=[0, 1]).mean().T
X_test = X_test.T.groupby(level=[0, 1]).mean().T

model.fit(X_train, y_train)
y_pred = safe_predict_proba(model, X_test)
y_pred = _safe_predict_proba(model, X_test)
y_pred_combined.extend(y_pred)

ensemble_predictions[model_name] = y_pred_combined

ensemble_predictions["labels"] = y_test_combined

self.ensemble_predictions = pd.DataFrame.from_dict(ensemble_predictions)
self.ensemble_summary = ensemble_summary(
self.ensemble_summary = _ensemble_summary(
self.ensemble_predictions, self.metrics
)

@@ -298,7 +298,7 @@ def fit_ensemble(self, ensemble_predictors=None):
desc="Training final ensemble models",
bar_format=bar_format,
):
X_train, y_train = retrieve_X_y(
X_train, y_train = _retrieve_X_y(
labelled_data=self.ensemble_training_data_final[0]
)

@@ -314,7 +314,7 @@ def fit_ensemble(self, ensemble_predictors=None):

def predict(self, X_dict, ensemble_model_key):
"""
Predict class labels for samples in X
Predict class labels for samples in X.

Parameters
----------
@@ -336,7 +336,7 @@ def predict(self, X_dict, ensemble_model_key):
modality_name = self.modality_names[i]
X = X_dict[modality_name]

X, _ = X_to_numpy(X)
X, _ = _X_to_numpy(X)

base_models = copy.deepcopy(self.final_models["base models"][modality_name])
self.base_predictors = {}
@@ -345,15 +345,15 @@ def predict(self, X_dict, ensemble_model_key):
self.base_predictors[base_model_dict["model name"]] = 0

base_model = pickle.loads(base_model_dict["pickled model"])
y_pred = safe_predict_proba(base_model, X)
y_pred = _safe_predict_proba(base_model, X)

base_model_dict["fold id"] = 0
base_model_dict["y_pred"] = y_pred

combined_predictions = self._combine_predictions_outer(
base_models, modality_name, model_building=True
)
ensemble_prediction_data = append_modality(
ensemble_prediction_data = _append_modality(
ensemble_prediction_data, combined_predictions, model_building=True
)
ensemble_prediction_data = ensemble_prediction_data[0]
@@ -367,12 +367,12 @@ def predict(self, X_dict, ensemble_model_key):
self.final_models["ensemble models"][ensemble_model_key]
)

y_pred = safe_predict_proba(ensemble_model, ensemble_prediction_data)
y_pred = _safe_predict_proba(ensemble_model, ensemble_prediction_data)
return y_pred

@ignore_warnings(category=ConvergenceWarning)
def _fit_base(self, X, y, base_predictors=None, modality_name=None):
X, feature_names = X_to_numpy(X)
X, feature_names = _X_to_numpy(X)

self.modality_names.append(modality_name)
self.feature_names[modality_name] = feature_names
@@ -387,7 +387,7 @@ def _fit_base(self, X, y, base_predictors=None, modality_name=None):
modality_name=modality_name,
)

self.ensemble_training_data = append_modality(
self.ensemble_training_data = _append_modality(
self.ensemble_training_data, ensemble_training_data_modality
)

@@ -399,12 +399,12 @@ def _fit_base(self, X, y, base_predictors=None, modality_name=None):
modality_name=modality_name,
)

self.ensemble_test_data = append_modality(
self.ensemble_test_data = _append_modality(
self.ensemble_test_data, ensemble_test_data_modality
) # append data to dataframe

# create a summary of base predictor performance
self.base_summary = base_summary(self.ensemble_test_data, self.metrics)
self.base_summary = _base_summary(self.ensemble_test_data, self.metrics)

if self.model_building:
self._fit_base_final(X=X, y=y, modality_name=modality_name)
@@ -428,7 +428,7 @@ def _fit_base_final(self, X, y, modality_name=None):
modality_name=modality_name,
)

self.ensemble_training_data_final = append_modality(
self.ensemble_training_data_final = _append_modality(
self.ensemble_training_data_final, ensemble_training_data_modality
)

@@ -562,7 +562,7 @@ def _train_predict_single_base_predictor(

X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
X_sample, y_sample = sample(
X_sample, y_sample = _sample(
X_train,
y_train,
strategy=self.sampling_strategy,
@@ -581,7 +581,7 @@
}

else:
y_pred = safe_predict_proba(model, X_test)
y_pred = _safe_predict_proba(model, X_test)

results_dict = {
"model name": model_name,
@@ -677,7 +677,6 @@ def save(self, path=None):

Parameters
----------

path : optional, default=None
Path to save the EnsembleIntegration class object.
"""
@@ -695,7 +694,6 @@ def load(cls, path):

Parameters
----------

path : str
Path to load the EnsembleIntegration class object.
"""
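Taken together, the renames in ei.py leave the public workflow unchanged: fit base predictors per modality, fit ensemble predictors on their cross-validated outputs, then predict with a chosen ensemble model. A hedged usage sketch follows; the constructor arguments, predictor choices, the "S.LR" key, and the synthetic data are assumptions based on the method signatures visible in this diff, not a verbatim example from the package docs:

    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from eipy.ei import EnsembleIntegration

    rng = np.random.default_rng(0)
    y = pd.Series(rng.integers(0, 2, size=100))
    X_dict = {
        "clinical": pd.DataFrame(rng.normal(size=(100, 5))),
        "genomic": pd.DataFrame(rng.normal(size=(100, 20))),
    }

    base_predictors = {"LR": LogisticRegression(), "RF": RandomForestClassifier()}
    ensemble_predictors = {"S.LR": LogisticRegression()}  # a stacked logistic regression

    EI = EnsembleIntegration(
        base_predictors=base_predictors,
        k_outer=5,
        random_state=42,
        model_building=True,  # retain final models so that predict() can be called
    )
    EI.fit_base(X_dict, y)  # dict input trains one set of base models per modality
    EI.fit_ensemble(ensemble_predictors=ensemble_predictors)
    scores = EI.predict(X_dict, ensemble_model_key="S.LR")  # scores from the chosen ensemble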