diff --git a/optimizer/_plot.py b/optimizer/_plot.py index 38802af..75ffae8 100644 --- a/optimizer/_plot.py +++ b/optimizer/_plot.py @@ -1,22 +1,43 @@ -from matplotlib.ticker import MaxNLocator -import matplotlib.pylab as plt +import matplotlib.pyplot as plt +import optuna import seaborn as sns +from matplotlib.ticker import MaxNLocator -import numpy as np - +def _plot_progress(opt: optuna.Trial, + color: str = '#eeaa24') -> None: + """ + Plot the optimization progress of an Optuna study. + Parameters + ---------- + opt : optuna.Trial + The Optuna study object to plot. + color : str, optional + The color of the regression line in the plot. Default is '#eeaa24'. -def _plot_progress(opt, marker='.', color='#eeaa24', alpha=0.8): + Returns + ------- + None + The plot is displayed via plt.show(). + """ - fig, ax = plt.subplots(1,1) + # Create a new figure with one subplot + fig, ax = plt.subplots(1, 1) + # Set the title and labels for the plot ax.set_title(type(opt).__name__) ax.set_xlabel('iters') ax.set_ylabel('score') + # Get the trials from the Optuna study object trials = opt.trials_ - sns.regplot(trials.index+1, 'score', trials, color=color) + # Plot a regression line of the score over the index of each trial + sns.regplot(x=trials.index + 1, y='score', data=trials, color=color) + # Set the x-axis tick locator to only show integers ax.xaxis.set_major_locator(MaxNLocator(integer=True)) - fig.show() + + # Display the plot + plt.show() + diff --git a/selector/base.py b/selector/base.py index 1a4094d..a812156 100644 --- a/selector/base.py +++ b/selector/base.py @@ -1,25 +1,35 @@ -import pandas as pd -import numpy as np import abc - from copy import copy from time import time +from typing import List, Optional, Callable, Dict, Any, Union +import numpy as np +import pandas as pd from sklearn.base import BaseEstimator, TransformerMixin -from sklearn.utils.random import check_random_state +from sklearn.exceptions import NotFittedError +from sklearn.utils import check_random_state from robusta.crossval import 
crossval - -from ._verbose import _print_last -from ._subset import FeatureSubset -from ._plot import _plot_progress, _plot_subset - - - +from robusta.utils._subset import FeatureSubset +from robusta.utils._plot import _plot_progress, _plot_subset +from robusta.utils._verbose import _print_last class _Selector(BaseEstimator, TransformerMixin): + """A base class for feature selection transformers. + Attributes + ---------- + features_ : FeatureSubset + The selected features to be used in `transform`. + + Methods + ------- + transform(X: pd.DataFrame) -> pd.DataFrame: + Reduce X to the selected features. + get_subset() -> List[str]: + Get list of columns to select. + """ def transform(self, X): """Reduce X to the selected features. @@ -123,8 +133,7 @@ def _eval_subset(self, subset, X, y, groups): - def eval_subset(self, subset, X, y, groups=None): - + def eval_subset(self, subset, X, y, groups=None): # Convert to FeatureSubset if type(subset) != type(self.features_): subset = self.features_.copy().set_subset(subset) @@ -146,7 +155,8 @@ def eval_subset(self, subset, X, y, groups=None): self.trials_.append(subset) # Verbose - _print_last(self) + if self.verbose: + _print_last(self) # Check limits self._check_max_iter() @@ -158,14 +168,16 @@ def eval_subset(self, subset, X, y, groups=None): def _check_max_iter(self): if hasattr(self, 'max_iter') and self.max_iter: if self.max_iter <= self.n_iters_: - if self.verbose: print('Iterations limit exceed!') + if self.verbose: + print('Iterations limit exceeded!') raise KeyboardInterrupt def _check_max_time(self): if hasattr(self, 'max_time') and self.max_time: if self.max_time <= self.total_time_: - if self.verbose: print('Time limit exceed!') + if self.verbose: + print('Time limit exceeded!') raise KeyboardInterrupt @@ -178,18 +190,18 @@ def n_iters_(self): return len(self.trials_) - #@property - #def feature_importances_(self): - # subset = self._select_features() - # trial = _find_trial(subset) - # return 
pd.Series(trial['importance'], index=self.features_) + @property + def feature_importances_(self): + subset = self._select_features() + trial = _find_trial(subset) + return pd.Series(trial['importance'], index=self.features_) - #@property - #def feature_importances_std_(self): - # subset = self._select_features() - # trial = _find_trial(subset) - # return pd.Series(trial['importance_std'], index=self.features_) + @property + def feature_importances_std_(self): + subset = self._select_features() + trial = _find_trial(subset) + return pd.Series(trial['importance_std'], index=self.features_) def plot_progress(self, **kwargs): @@ -199,7 +211,6 @@ def plot_subset(self, **kwargs): return _plot_subset(self, **kwargs) def get_subset(self): - if hasattr(self, 'best_subset_'): return self.best_subset_ else: @@ -207,35 +218,27 @@ def get_subset(self): raise NotFittedError(f'{model_name} is not fitted') - - def _check_k_features(k_features, n_features, param='k_features'): - if isinstance(k_features, int): if k_features > 0: k_features = k_features else: raise ValueError(f'Integer <{param}> must be greater than 0') - elif isinstance(k_features, float): if 0 < k_features < 1: k_features = max(k_features * n_features, 1) k_features = int(k_features) else: raise ValueError(f'Float <{param}> must be from interval (0, 1)') - else: raise ValueError(f'Parameter <{param}> must be int or float,' f'got {k_features}') - return k_features - - -class _WrappedGroupSelector: - def _get_importance(subset, - result): +class WrappedGroupSelector: + @staticmethod + def _get_importance(subset, result): if 'importance' in result: features, imp = result['features'], result['importance'] groups = [group for group, _ in features] @@ -243,8 +246,8 @@ def _get_importance(subset, imp = pd.DataFrame(imp, columns=groups).T imp = imp.groupby(groups).sum() - subset.importance = imp.mean(axis=1) - subset.importance_std = imp.std(axis=1) + subset.importance = imp.mean(axis=1) + subset.importance_std = imp.std(axis=1) return subset def 
_set_features(self, X):