From 8d5308db5cb8ee751e1ab65327e5e690b97dacc2 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 22 Nov 2023 18:25:36 +0100 Subject: [PATCH 1/4] feat(HEBO): Added initial support for single/mutli objective HEBO --- src/amltk/optimization/optimizers/hebo.py | 239 ++++++++++++++++++++++ src/amltk/pipeline/parsers/hebo.py | 142 +++++++++++++ tests/optimizers/test_optimizers.py | 28 +++ 3 files changed, 409 insertions(+) create mode 100644 src/amltk/optimization/optimizers/hebo.py create mode 100644 src/amltk/pipeline/parsers/hebo.py diff --git a/src/amltk/optimization/optimizers/hebo.py b/src/amltk/optimization/optimizers/hebo.py new file mode 100644 index 00000000..21c32f29 --- /dev/null +++ b/src/amltk/optimization/optimizers/hebo.py @@ -0,0 +1,239 @@ +"""An optimizer from HEBO to optimize a HEBO design space. + +### In progress +""" +# TODO +# Constraints +# Parallel ask/suggest +# I imagine iterative tell is fine, we don't need concurrent. +# Figure out what other feeatures there are +# Doc of course + +from __future__ import annotations + +from collections.abc import Sequence +from itertools import count +from pathlib import Path +from typing import TYPE_CHECKING, Any, TypeAlias, overload +from typing_extensions import override + +import numpy as np +import pandas as pd +from hebo.design_space.design_space import DesignSpace +from hebo.optimizers.general import GeneralBO +from hebo.optimizers.hebo import HEBO + +import amltk.randomness +from amltk.optimization.metric import Metric +from amltk.optimization.optimizer import Optimizer +from amltk.optimization.trial import Trial +from amltk.pipeline.parsers.hebo import parser +from amltk.store import PathBucket + +if TYPE_CHECKING: + from typing import Protocol + + from hebo.optimizers.abstract_optimizer import AbstractOptimizer + + from amltk.pipeline import Node + from amltk.types import Seed + + class HEBOParser(Protocol): + """A protocol for HEBO design space parser.""" + + def __call__( + self, + node: Node, 
+ *, + flat: bool = False, + delim: str = ":", + ) -> DesignSpace: + """See [`hebo`][amltk.pipeline.parsers.hebo.parser].""" + ... + + +HEBOTrial: TypeAlias = pd.DataFrame +"""HEBO uses dataframes internally.""" + + +class HEBOOptimizer(Optimizer[HEBOTrial]): + """An optimizer that uses HEBO to optimize a HEBO design space.""" + + def __init__( + self, + optimizer: AbstractOptimizer, + metrics: Metric | Sequence[Metric], + bucket: PathBucket | None = None, + seed: Seed | None = None, + ) -> None: + """Initialize the optimizer. + + Args: + optimizer: The HEBO optimizer. + metrics: The metrics to optimize. + bucket: The bucket to store results of individual trials from this + optimizer. + seed: The seed to use for trials generated from this optimizer. + """ + metrics = metrics if isinstance(metrics, Sequence) else [metrics] + super().__init__(metrics=metrics, bucket=bucket) + self.optimizer = optimizer + self.seed = seed + + # TODO: If HEBO does multi-fidelity or some other kinds of optimization, + # this may not be sufficient + self.name_generator = iter(f"trial_{i}" for i in count()) + + @override + def tell(self, report: Trial.Report[HEBOTrial]) -> None: + """Tell the optimizer the report for an asked trial. + + Args: + report: The report for a trial + """ + raw_x = report.trial.info + assert raw_x is not None + + # NOTE: Given a trial fail/crashed, we will have inf/worst or None for each + # metric. As long as we fill in any missing metrics and maintain metric order, + # than HEBO should be fine with these reported. + # Either way, we don't actually have to look at the status of the trial to give + # the info to hebo. 
+ + # Make sure we have a value for each + _lookup: dict[str, Metric.Value] = { + v.metric.name: v for v in report.metric_values + } + metric_values = [ + _lookup.get(metric.name, metric.worst) for metric in self.metrics + ] + raw_y = np.array([[v.value for v in metric_values]]) + self.optimizer.observe(raw_x, raw_y) + + @override + @overload + def ask(self, *, n_suggestions: int) -> list[Trial[HEBOTrial]]: + ... + + @override + @overload + def ask(self, *, n_suggestions: None = None) -> Trial[HEBOTrial]: + ... + + @override + def ask( + self, + *, + n_suggestions: int | None = None, + fix_input: dict[str, Any] | None = None, + ) -> Trial[HEBOTrial] | list[Trial[HEBOTrial]]: + """Ask the optimizer for a trial to evaluate. + + Returns: + A Trial + """ + if fix_input is not None: + # TODO: Probably fine to implement but not a priority + # right now. + raise NotImplementedError( + "fix_input not yet supported for HEBOOptimizer", + ) + + match n_suggestions: + # TODO: Allow multiple suggestions per iteration + case int(): + raise NotImplementedError( + "Multiple suggestions per iteration not yet supported", + ) + case None: + # NOTE: Assuming for now that if I suggest without + # anything, i.e. `n_suggestions = 1`, then I get a + # single row dataframe. 
+ df_config: pd.DataFrame = self.optimizer.suggest() # type: ignore + assert isinstance(df_config, pd.DataFrame) + assert len(df_config) == 1 + + config: dict[str, Any] = df_config.iloc[0].to_dict() + + return Trial( + name=next(self.name_generator), + config=config, + bucket=self.bucket, + metrics=self.metrics, + info=df_config, + seed=amltk.randomness.as_int(self.seed), + ) + case _: # type: ignore + raise ValueError(f"{n_suggestions=} must be `None` or `int > 0`") + + @classmethod + def create( + cls, + *, + space: Node | DesignSpace, + metrics: Metric | Sequence[Metric], + seed: Seed | None = None, + bucket: PathBucket | str | Path | None = None, + **optimizer_kwargs: Any, + ) -> HEBOOptimizer: + """Create an optimizer from HEBO. + + Args: + space: The space to search over + metrics: The metrics to optimize. + + * If `Metric`, then this is a single objective optimization with `HEBO`. + * If `Sequence[Metric]`, then this is a multi-objective optimization + with `GeneralBO`. + + seed: The seed to use for trials generated from this optimizer. + bucket: The bucket to store results of individual trials from this + optimizer. + **optimizer_kwargs: Keyword arguments to pass to the optimizer constructed. + + Returns: + A HEBOOptimizer. + """ + # TODO: Since hebo in it's observe will ignore anything that's inf, we can + # not have metrics that have an unbounded best, as these would get ignored... + # Probably need to raise an issue. + # Not really sure how to report or handle that case though as it's only a + # theoretical problem. + _check_metrics = [metrics] if isinstance(metrics, Metric) else metrics + if any(np.isinf(metric.optimal.value) for metric in _check_metrics): + raise ValueError( + "HEBO doesn't support metrics with an unbounded optimal value i.e. 
inf", + ) + + scramble_seed = amltk.randomness.as_int(seed) + + if isinstance(bucket, str | Path): + bucket = PathBucket(bucket) + + space = space if isinstance(space, DesignSpace) else parser(space) + + match metrics: + case Metric() | [Metric()]: + optimizer = HEBO( + space=space, + scramble_seed=scramble_seed, + **optimizer_kwargs, + ) + case Sequence(): + assert len(metrics) > 1 + # TODO: Not really sure if I should give a ref point or not, especially + # if there are unbounded metrics. + ref_point = np.array([metric.worst.value for metric in metrics]) + optimizer = GeneralBO( + space=space, + num_obj=len(metrics), + ref_point=ref_point, + **optimizer_kwargs, + ) + + return cls(optimizer=optimizer, metrics=metrics, bucket=bucket, seed=seed) + + @override + @classmethod + def preferred_parser(cls) -> HEBOParser: + return parser diff --git a/src/amltk/pipeline/parsers/hebo.py b/src/amltk/pipeline/parsers/hebo.py new file mode 100644 index 00000000..d43dc3fc --- /dev/null +++ b/src/amltk/pipeline/parsers/hebo.py @@ -0,0 +1,142 @@ +"""Parser for creating a HEBO `DesignSpace` from a pipeline `Node`.""" +from __future__ import annotations + +from collections.abc import Mapping +from typing import TYPE_CHECKING, Any, TypeAlias + +import numpy as np +from hebo.design_space.design_space import DesignSpace + +if TYPE_CHECKING: + from amltk.pipeline import Node + +HP: TypeAlias = dict[str, Any] + +PAIR = 2 + + +def _parse_hp( + node_name: str, + *, + hp_name: str, + hp: tuple | list | Mapping, + delim: str = ":", +) -> HP: + new_hp_name = f"{node_name}{delim}{hp_name}" + match hp: + # If the name in a dict does not match what we see in the `space` dict, raise + case {"name": _name_in_dict} if _name_in_dict != hp_name: + raise ValueError( + f'Can\'t have "name" in {hp=} as it is already given the {hp_name=}.', + ) + # Otherwise it's a dictionary with either the same name or no name, either case + # we give it a new name prefixed by the node's name + case Mapping(): + return {**hp, "name": new_hp_name} + # Bounded int/float + case 
tuple() as tup if len(tup) == PAIR: + match tup: + case (int() | np.integer(), int() | np.integer()): + x, y = tup + return { + "name": new_hp_name, + "type": "int", + "lb": int(x), + "ub": int(y), + } + case (float() | np.floating(), float() | np.floating()): + x, y = tup + return { + "name": new_hp_name, + "type": "num", + "lb": float(x), + "ub": float(y), + } + case (x, y): + raise ValueError( + f"Expected {hp_name} to have same type for lower/upper bound," + f"got lower: {type(x)}, upper: {type(y)}.", + ) + # Bool param + case (one, two) if isinstance(one, bool) and isinstance(two, bool): + return {"name": hp_name, "type": "bool"} + # Categorical param + case list() if all(isinstance(item, str) for item in hp): + return {"name": hp_name, "type": "cat", "categories": hp} + # Constant value + case (one,) if isinstance(one, int | float | str | bool): + return {"name": hp_name, "type": "cat", "categories": [one]} + case _: + raise ValueError( + f"Could not parse {hp_name} as a valid HEBO distribution.\n{hp=}", + ) + + raise ValueError(f"Could not parse {hp_name} as a valid HEBO distribution.\n{hp=}") + + +def _parse_space(node: Node, *, flat: bool = False, delim: str = ":") -> dict[str, HP]: + match node.space: + case None: + space = {} + case list(): + space = {hp["name"]: hp for hp in node.space} + case Mapping(): + space = { + name: _parse_hp(node_name=node.name, hp_name=name, hp=hp, delim=delim) + for name, hp in node.space.items() + } + case _: + raise ValueError( + f"Can't parse {node.space=} as a HEBO space for node {node.name=}.", + ) + + for child in node.nodes: + subspace: dict[str, HP] = _parse_space(child) + if not flat: + _prefix = lambda _hp_name: f"{node.name}{delim}{_hp_name}" + subspace = { + _prefix(hp_name): {**hp, "name": _prefix(hp_name)} + for hp_name, hp in subspace.items() + } + + for hp_name, hp in subspace.items(): + if hp_name in space: + raise ValueError( + f"Duplicate name {hp_name} already in space from space of " + 
f"{node.name}\nCurrently parsed space: {space}", + ) + space[hp_name] = hp + + return space + + +def parser( + node: Node, + *, + flat: bool = False, + conditionals: bool = False, + delim: str = ":", +) -> DesignSpace: + """Parse a Node and its children into a hebo DesignSpace. + + Args: + node: The Node to parse + flat: If `False`, child hyperparameter names are prefixed by the parent's name. + conditionals: Whether to include conditionals in the space from a + [`Choice`][amltk.pipeline.Choice]. If this is `False`, this will + also remove all forbidden clauses and other conditional clauses. + The primary use of this functionality is that some optimizers do not + support these features. + + !!! warning "Not yet supported" + + This functionality is not yet supported in HEBO + + delim: The delimiter to use for the names of the hyperparameters + """ + if conditionals: + raise NotImplementedError("Conditionals are not yet supported with HEBO.") + + space = _parse_space(node=node, flat=flat, delim=delim) + hp_values = list(space.values()) + return DesignSpace().parse(hp_values) diff --git a/tests/optimizers/test_optimizers.py b/tests/optimizers/test_optimizers.py index 478c7d16..005c66bc 100644 --- a/tests/optimizers/test_optimizers.py +++ b/tests/optimizers/test_optimizers.py @@ -4,6 +4,7 @@ from pathlib import Path from typing import TYPE_CHECKING +import numpy as np import pytest from pytest_cases import case, parametrize, parametrize_with_cases @@ -12,6 +13,7 @@ from amltk.profiling import Timer if TYPE_CHECKING: + from amltk.optimization.optimizers.hebo import HEBOOptimizer from amltk.optimization.optimizers.neps import NEPSOptimizer from amltk.optimization.optimizers.optuna import OptunaOptimizer from amltk.optimization.optimizers.smac import SMACOptimizer @@ -86,6 +88,32 @@ def opt_optuna(metric: Metric, tmp_path: Path) -> OptunaOptimizer: ) +# NOTE: HEBO does not support unbounded optimals in metrics +hebo_metrics = [ + Metric("score_bounded", minimize=False, 
bounds=(0, 1)), + Metric("score_unbounded", minimize=False, bounds=(-np.inf, 10)), + Metric("loss_unbounded", minimize=True, bounds=(-10, np.inf)), + Metric("loss_bounded", minimize=True, bounds=(-1, 1)), +] + + +@case +@parametrize("metric", [*hebo_metrics, hebo_metrics]) # Single obj and multi +def opt_hebo(metric: Metric, tmp_path: Path) -> HEBOOptimizer: + try: + from amltk.optimization.optimizers.hebo import HEBOOptimizer + except ImportError: + pytest.skip("HEBO is not installed") + + pipeline = Component(_A, name="hi", space={"a": (1, 10)}) + return HEBOOptimizer.create( + space=pipeline, + metrics=metric, + seed=42, + bucket=tmp_path, + ) + + @case @parametrize("metric", [*metrics]) # Single obj def opt_neps(metric: Metric, tmp_path: Path) -> NEPSOptimizer: From 2fa5f79f900e5d9d2c2df4bd3d950f1c0acc4764 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 22 Nov 2023 18:35:22 +0100 Subject: [PATCH 2/4] fix(HEBO): Make sure to report costs always --- src/amltk/optimization/optimizers/hebo.py | 45 ++++++++++++++++++++--- 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/src/amltk/optimization/optimizers/hebo.py b/src/amltk/optimization/optimizers/hebo.py index 21c32f29..d7683120 100644 --- a/src/amltk/optimization/optimizers/hebo.py +++ b/src/amltk/optimization/optimizers/hebo.py @@ -99,7 +99,6 @@ def tell(self, report: Trial.Report[HEBOTrial]) -> None: # than HEBO should be fine with these reported. # Either way, we don't actually have to look at the status of the trial to give # the info to hebo. 
- # Make sure we have a value for each _lookup: dict[str, Metric.Value] = { v.metric.name: v for v in report.metric_values @@ -107,7 +106,9 @@ def tell(self, report: Trial.Report[HEBOTrial]) -> None: metric_values = [ _lookup.get(metric.name, metric.worst) for metric in self.metrics ] - raw_y = np.array([[v.value for v in metric_values]]) + + costs = [self.cost(v) for v in metric_values] + raw_y = np.array([costs]) # Yep, it needs 2d, for single report tells self.optimizer.observe(raw_x, raw_y) @override @@ -221,13 +222,12 @@ def create( ) case Sequence(): assert len(metrics) > 1 - # TODO: Not really sure if I should give a ref point or not, especially - # if there are unbounded metrics. - ref_point = np.array([metric.worst.value for metric in metrics]) optimizer = GeneralBO( space=space, num_obj=len(metrics), - ref_point=ref_point, + # TODO: Not really sure if I should give a ref point or not, + # especially if there are unbounded metrics. + ref_point=np.array(cls.worst_possible_cost(metrics)), **optimizer_kwargs, ) @@ -237,3 +237,36 @@ def create( @classmethod def preferred_parser(cls) -> HEBOParser: return parser + + @overload + @classmethod + def worst_possible_cost(cls, metric: Metric) -> float: + ... + + @overload + @classmethod + def worst_possible_cost(cls, metric: Sequence[Metric]) -> list[float]: + ... 
+ + @classmethod + def worst_possible_cost( + cls, + metric: Metric | Sequence[Metric], + ) -> float | list[float]: + """Get the crash cost for a metric (or each metric in a sequence) for HEBO.""" + match metric: + case Metric(bounds=(lower, upper)): # Bounded metrics + return abs(upper - lower) + case Metric(): # Unbounded metric + return np.inf + case metrics: + return [cls.worst_possible_cost(m) for m in metrics] + + @classmethod + def cost(cls, value: Metric.Value) -> float: + """Get the cost for a metric value for HEBO.""" + match value.distance_to_optimal: + case None: # If we can't compute the distance, use the loss + return value.loss + case distance: # If we can compute the distance, use that + return distance From 74d9c7f591a211923f71063e3fecf9127c032358 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 22 Nov 2023 18:25:36 +0100 Subject: [PATCH 3/4] feat(HEBO): Added initial support for single/mutli objective HEBO --- src/amltk/optimization/optimizers/hebo.py | 239 ++++++++++++++++++++++ src/amltk/pipeline/parsers/hebo.py | 142 +++++++++++++ tests/optimizers/test_optimizers.py | 28 +++ 3 files changed, 409 insertions(+) create mode 100644 src/amltk/optimization/optimizers/hebo.py create mode 100644 src/amltk/pipeline/parsers/hebo.py diff --git a/src/amltk/optimization/optimizers/hebo.py b/src/amltk/optimization/optimizers/hebo.py new file mode 100644 index 00000000..21c32f29 --- /dev/null +++ b/src/amltk/optimization/optimizers/hebo.py @@ -0,0 +1,239 @@ +"""An optimizer from HEBO to optimize a HEBO design space. + +### In progress +""" +# TODO +# Constraints +# Parallel ask/suggest +# I imagine iterative tell is fine, we don't need concurrent. 
+# Figure out what other feeatures there are +# Doc of course + +from __future__ import annotations + +from collections.abc import Sequence +from itertools import count +from pathlib import Path +from typing import TYPE_CHECKING, Any, TypeAlias, overload +from typing_extensions import override + +import numpy as np +import pandas as pd +from hebo.design_space.design_space import DesignSpace +from hebo.optimizers.general import GeneralBO +from hebo.optimizers.hebo import HEBO + +import amltk.randomness +from amltk.optimization.metric import Metric +from amltk.optimization.optimizer import Optimizer +from amltk.optimization.trial import Trial +from amltk.pipeline.parsers.hebo import parser +from amltk.store import PathBucket + +if TYPE_CHECKING: + from typing import Protocol + + from hebo.optimizers.abstract_optimizer import AbstractOptimizer + + from amltk.pipeline import Node + from amltk.types import Seed + + class HEBOParser(Protocol): + """A protocol for HEBO design space parser.""" + + def __call__( + self, + node: Node, + *, + flat: bool = False, + delim: str = ":", + ) -> DesignSpace: + """See [`hebo`][amltk.pipeline.parsers.hebo.parser].""" + ... + + +HEBOTrial: TypeAlias = pd.DataFrame +"""HEBO uses dataframes internally.""" + + +class HEBOOptimizer(Optimizer[HEBOTrial]): + """An optimizer that uses HEBO to optimize a HEBO design space.""" + + def __init__( + self, + optimizer: AbstractOptimizer, + metrics: Metric | Sequence[Metric], + bucket: PathBucket | None = None, + seed: Seed | None = None, + ) -> None: + """Initialize the optimizer. + + Args: + optimizer: The HEBO optimizer. + metrics: The metrics to optimize. + bucket: The bucket to store results of individual trials from this + optimizer. + seed: The seed to use for trials generated from this optimizer. 
+ """ + metrics = metrics if isinstance(metrics, Sequence) else [metrics] + super().__init__(metrics=metrics, bucket=bucket) + self.optimizer = optimizer + self.seed = seed + + # TODO: If HEBO does multi-fidelity or some other kinds of optimization, + # this may not be sufficient + self.name_generator = iter(f"trial_{i}" for i in count()) + + @override + def tell(self, report: Trial.Report[HEBOTrial]) -> None: + """Tell the optimizer the report for an asked trial. + + Args: + report: The report for a trial + """ + raw_x = report.trial.info + assert raw_x is not None + + # NOTE: Given a trial fail/crashed, we will have inf/worst or None for each + # metric. As long as we fill in any missing metrics and maintain metric order, + # than HEBO should be fine with these reported. + # Either way, we don't actually have to look at the status of the trial to give + # the info to hebo. + + # Make sure we have a value for each + _lookup: dict[str, Metric.Value] = { + v.metric.name: v for v in report.metric_values + } + metric_values = [ + _lookup.get(metric.name, metric.worst) for metric in self.metrics + ] + raw_y = np.array([[v.value for v in metric_values]]) + self.optimizer.observe(raw_x, raw_y) + + @override + @overload + def ask(self, *, n_suggestions: int) -> list[Trial[HEBOTrial]]: + ... + + @override + @overload + def ask(self, *, n_suggestions: None = None) -> Trial[HEBOTrial]: + ... + + @override + def ask( + self, + *, + n_suggestions: int | None = None, + fix_input: dict[str, Any] | None = None, + ) -> Trial[HEBOTrial] | list[Trial[HEBOTrial]]: + """Ask the optimizer for a trial to evaluate. + + Returns: + A Trial + """ + if fix_input is not None: + # TODO: Probably fine to implement but not a priority + # right now. 
+ raise NotImplementedError( + "fix_input not yet supported for HEBOOptimizer", + ) + + match n_suggestions: + # TODO: Allow multiple suggestions per iteration + case int(): + raise NotImplementedError( + "Multiple suggestions per iteration not yet supported", + ) + case None: + # NOTE: Assuming for now that if I suggest without + # anything, i.e. `n_suggestions = 1`, then I get a + # single row dataframe. + df_config: pd.DataFrame = self.optimizer.suggest() # type: ignore + assert isinstance(df_config, pd.DataFrame) + assert len(df_config) == 1 + + config: dict[str, Any] = df_config.iloc[0].to_dict() + + return Trial( + name=next(self.name_generator), + config=config, + bucket=self.bucket, + metrics=self.metrics, + info=df_config, + seed=amltk.randomness.as_int(self.seed), + ) + case _: # type: ignore + raise ValueError(f"{n_suggestions=} must be `None` or `int > 0`") + + @classmethod + def create( + cls, + *, + space: Node | DesignSpace, + metrics: Metric | Sequence[Metric], + seed: Seed | None = None, + bucket: PathBucket | str | Path | None = None, + **optimizer_kwargs: Any, + ) -> HEBOOptimizer: + """Create an optimizer from HEBO. + + Args: + space: The space to search over + metrics: The metrics to optimize. + + * If `Metric`, then this is a single objective optimization with `HEBO`. + * If `Sequence[Metric]`, then this is a multi-objective optimization + with `GeneralBO`. + + seed: The seed to use for trials generated from this optimizer. + bucket: The bucket to store results of individual trials from this + optimizer. + **optimizer_kwargs: Keyword arguments to pass to the optimizer constructed. + + Returns: + A HEBOOptimizer. + """ + # TODO: Since hebo in it's observe will ignore anything that's inf, we can + # not have metrics that have an unbounded best, as these would get ignored... + # Probably need to raise an issue. + # Not really sure how to report or handle that case though as it's only a + # theoretical problem. 
+ _check_metrics = [metrics] if isinstance(metrics, Metric) else metrics + if any(np.isinf(metric.optimal.value) for metric in _check_metrics): + raise ValueError( + "HEBO doesn't support metrics with an unbounded optimal value i.e. inf", + ) + + scramble_seed = amltk.randomness.as_int(seed) + + if isinstance(bucket, str | Path): + bucket = PathBucket(bucket) + + space = space if isinstance(space, DesignSpace) else parser(space) + + match metrics: + case Metric() | [Metric()]: + optimizer = HEBO( + space=space, + scramble_seed=scramble_seed, + **optimizer_kwargs, + ) + case Sequence(): + assert len(metrics) > 1 + # TODO: Not really sure if I should give a ref point or not, especially + # if there are unbounded metrics. + ref_point = np.array([metric.worst.value for metric in metrics]) + optimizer = GeneralBO( + space=space, + num_obj=len(metrics), + ref_point=ref_point, + **optimizer_kwargs, + ) + + return cls(optimizer=optimizer, metrics=metrics, bucket=bucket, seed=seed) + + @override + @classmethod + def preferred_parser(cls) -> HEBOParser: + return parser diff --git a/src/amltk/pipeline/parsers/hebo.py b/src/amltk/pipeline/parsers/hebo.py new file mode 100644 index 00000000..d43dc3fc --- /dev/null +++ b/src/amltk/pipeline/parsers/hebo.py @@ -0,0 +1,142 @@ +"""Parser for creating a HEBO `DesignSpace` from a pipeline `Node`.""" +from __future__ import annotations + +from collections.abc import Mapping +from typing import TYPE_CHECKING, Any, TypeAlias + +import numpy as np +from hebo.design_space.design_space import DesignSpace + +if TYPE_CHECKING: + from amltk.pipeline import Node + +HP: TypeAlias = dict[str, Any] + +PAIR = 2 + + +def _parse_hp( + node_name: str, + *, + hp_name: str, + hp: tuple | list | Mapping, + delim: str = ":", +) -> HP: + new_hp_name = f"{node_name}{delim}{hp_name}" + match hp: + # If the name in a dict does not match what we see in the `space` dict, raise + case {"name": _name_in_dict} if _name_in_dict != hp_name: + raise ValueError( + f'Can\'t have "name" in {hp=} as it is already given the 
{hp_name=}.', + ) + # Otherwise it's a dictionary with either the same name or no name, either case + # we give it a new name prefixed by the nodes name + case Mapping(): + return {**hp, "name": new_hp_name} + # Bounded int/float + case tuple() as tup if len(tup) == PAIR: + match tup: + case (int() | np.integer(), int() | np.integer()): + x, y = tup + return { + "name": new_hp_name, + "type": "int", + "lb": int(x), + "ub": int(y), + } + case (float() | np.floating(), float() | np.floating()): + x, y = tup + return { + "name": new_hp_name, + "type": "num", + "lb": float(x), + "ub": float(y), + } + case (x, y): + raise ValueError( + f"Expected {hp_name} to have same type for lower/upper bound," + f"got lower: {type(x)}, upper: {type(y)}.", + ) + # Bool param + case (one, two) if isinstance(one, bool) and isinstance(two, bool): + return {"name": hp_name, "type": "bool"} + # Categorical param + case list() if all(isinstance(item, str) for item in hp): + return {"name": hp_name, "type": "cat", "categories": hp} + # Constant value + case (one,) if isinstance(one, int | float | str | bool): + return {"name": hp_name, "type": "cat", "categories": [one]} + case _: + raise ValueError( + f"Could not parse {hp_name} as a valid HEBO distribution.\n{hp=}", + ) + + raise ValueError(f"Could not parse {hp_name} as a valid HEBO distribution.\n{hp=}") + + +def _parse_space(node: Node, *, flat: bool = False, delim: str = ":") -> dict[str, HP]: + match node.space: + case None: + space = {} + case list(): + space = {hp["name"]: hp for hp in node.space} + case Mapping(): + space = { + name: _parse_hp(node_name=node.name, hp_name=name, hp=hp, delim=delim) + for name, hp in node.space.items() + } + case _: + raise ValueError( + f"Can't parse {node.space=} as a HEBO space for node {node.name=}.", + ) + + for child in node.nodes: + subspace: dict[str, HP] = _parse_space(child) + if not flat: + _prefix = lambda _hp_name: f"{node.name}{delim}{_hp_name}" + subspace = { + _prefix(hp_name): 
{**hp, "name": _prefix(hp_name)} + for hp_name, hp in subspace.items() + } + + for hp_name, hp in subspace.items(): + if hp_name in space: + raise ValueError( + f"Duplicate name {hp_name} already in space from space of " + f"{node.name}\nCurrently parsed space: {space}", + ) + space[hp_name] = hp + + return space + + +def parser( + node: Node, + *, + flat: bool = False, + conditionals: bool = False, + delim: str = ":", +) -> DesignSpace: + """Parse a Node and its children into a hebo DesignSpace. + + Args: + node: The Node to parse + flat: If `False`, child hyperparameter names are prefixed by the parent's name. + conditionals: Whether to include conditionals in the space from a + [`Choice`][amltk.pipeline.Choice]. If this is `False`, this will + also remove all forbidden clauses and other conditional clauses. + The primary use of this functionality is that some optimizers do not + support these features. + + !!! warning "Not yet supported" + + This functionality is not yet supported in HEBO + + delim: The delimiter to use for the names of the hyperparameters + """ + if conditionals: + raise NotImplementedError("Conditionals are not yet supported with HEBO.") + + space = _parse_space(node=node, flat=flat, delim=delim) + hp_values = list(space.values()) + return DesignSpace().parse(hp_values) diff --git a/tests/optimizers/test_optimizers.py b/tests/optimizers/test_optimizers.py index 478c7d16..005c66bc 100644 --- a/tests/optimizers/test_optimizers.py +++ b/tests/optimizers/test_optimizers.py @@ -4,6 +4,7 @@ from pathlib import Path from typing import TYPE_CHECKING +import numpy as np import pytest from pytest_cases import case, parametrize, parametrize_with_cases @@ -12,6 +13,7 @@ from amltk.profiling import Timer if TYPE_CHECKING: + from amltk.optimization.optimizers.hebo import HEBOOptimizer from amltk.optimization.optimizers.neps import NEPSOptimizer from amltk.optimization.optimizers.optuna import OptunaOptimizer from amltk.optimization.optimizers.smac import 
SMACOptimizer @@ -86,6 +88,32 @@ def opt_optuna(metric: Metric, tmp_path: Path) -> OptunaOptimizer: ) +# NOTE: HEBO does not support unbounded optimals in metrics +hebo_metrics = [ + Metric("score_bounded", minimize=False, bounds=(0, 1)), + Metric("score_unbounded", minimize=False, bounds=(-np.inf, 10)), + Metric("loss_unbounded", minimize=True, bounds=(-10, np.inf)), + Metric("loss_bounded", minimize=True, bounds=(-1, 1)), +] + + +@case +@parametrize("metric", [*hebo_metrics, hebo_metrics]) # Single obj and multi +def opt_hebo(metric: Metric, tmp_path: Path) -> HEBOOptimizer: + try: + from amltk.optimization.optimizers.hebo import HEBOOptimizer + except ImportError: + pytest.skip("HEBO is not installed") + + pipeline = Component(_A, name="hi", space={"a": (1, 10)}) + return HEBOOptimizer.create( + space=pipeline, + metrics=metric, + seed=42, + bucket=tmp_path, + ) + + @case @parametrize("metric", [*metrics]) # Single obj def opt_neps(metric: Metric, tmp_path: Path) -> NEPSOptimizer: From 5375b32c8e1a1e6fbb80bd51c2c02c48da82b5f3 Mon Sep 17 00:00:00 2001 From: eddiebergman Date: Wed, 22 Nov 2023 18:35:22 +0100 Subject: [PATCH 4/4] fix(HEBO): Make sure to report costs always --- src/amltk/optimization/optimizers/hebo.py | 45 ++++++++++++++++++++--- 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/src/amltk/optimization/optimizers/hebo.py b/src/amltk/optimization/optimizers/hebo.py index 21c32f29..d7683120 100644 --- a/src/amltk/optimization/optimizers/hebo.py +++ b/src/amltk/optimization/optimizers/hebo.py @@ -99,7 +99,6 @@ def tell(self, report: Trial.Report[HEBOTrial]) -> None: # than HEBO should be fine with these reported. # Either way, we don't actually have to look at the status of the trial to give # the info to hebo. 
- # Make sure we have a value for each _lookup: dict[str, Metric.Value] = { v.metric.name: v for v in report.metric_values @@ -107,7 +106,9 @@ def tell(self, report: Trial.Report[HEBOTrial]) -> None: metric_values = [ _lookup.get(metric.name, metric.worst) for metric in self.metrics ] - raw_y = np.array([[v.value for v in metric_values]]) + + costs = [self.cost(v) for v in metric_values] + raw_y = np.array([costs]) # Yep, it needs 2d, for single report tells self.optimizer.observe(raw_x, raw_y) @override @@ -221,13 +222,12 @@ def create( ) case Sequence(): assert len(metrics) > 1 - # TODO: Not really sure if I should give a ref point or not, especially - # if there are unbounded metrics. - ref_point = np.array([metric.worst.value for metric in metrics]) optimizer = GeneralBO( space=space, num_obj=len(metrics), - ref_point=ref_point, + # TODO: Not really sure if I should give a ref point or not, + # especially if there are unbounded metrics. + ref_point=np.array(cls.worst_possible_cost(metrics)), **optimizer_kwargs, ) @@ -237,3 +237,36 @@ def create( @classmethod def preferred_parser(cls) -> HEBOParser: return parser + + @overload + @classmethod + def worst_possible_cost(cls, metric: Metric) -> float: + ... + + @overload + @classmethod + def worst_possible_cost(cls, metric: Sequence[Metric]) -> list[float]: + ... 
+ + @classmethod + def worst_possible_cost( + cls, + metric: Metric | Sequence[Metric], + ) -> float | list[float]: + """Get the crash cost for a metric (or each metric in a sequence) for HEBO.""" + match metric: + case Metric(bounds=(lower, upper)): # Bounded metrics + return abs(upper - lower) + case Metric(): # Unbounded metric + return np.inf + case metrics: + return [cls.worst_possible_cost(m) for m in metrics] + + @classmethod + def cost(cls, value: Metric.Value) -> float: + """Get the cost for a metric value for HEBO.""" + match value.distance_to_optimal: + case None: # If we can't compute the distance, use the loss + return value.loss + case distance: # If we can compute the distance, use that + return distance