diff --git a/docs/source/algorithms.md b/docs/source/algorithms.md index bd8837b9a..b55b37ca2 100644 --- a/docs/source/algorithms.md +++ b/docs/source/algorithms.md @@ -3977,573 +3977,374 @@ and hence imprecise.\ `AXP (AX-platfofm)` - Very slow and not recommended. ```{eval-rst} -.. dropdown:: nevergrad_pso +.. dropdown:: nevergrad_pso + + **How to use this algorithm:** .. code-block:: - "nevergrad_pso" - - Minimize a scalar function using the Particle Swarm Optimization algorithm. - - The Particle Swarm Optimization algorithm was originally proposed by :cite:`Kennedy1995`.The - implementation in Nevergrad is based on :cite:`Zambrano2013`. - - PSO solves an optimization problem by evolving a swarm of particles (candidate solutions) across the - search space. Each particle adjusts its position based on its own experience (cognitive component) - and the experiences of its neighbors or the swarm (social component), using velocity updates. The - algorithm iteratively guides the swarm toward promising regions of the search space. - - - **transform** (str): The transform used to map from PSO optimization space to real space. Options: - - "arctan" (default) - - "identity" - - "gaussian" - - **population\_size** (int): The number of particles in the swarm. - - **n\_cores** (int): The number of CPU cores to use for parallel computation. - - **seed** (int, optional): Random seed for reproducibility. - - **stopping\_maxfun** (int, optional): Maximum number of function evaluations. - - **inertia** (float): - Inertia weight ω. Controls the influence of a particle's previous velocity. Must be less than 1 to - avoid divergence. Default is 0.7213475204444817. - - **cognitive** (float): - Cognitive coefficient :math:`\phi_p`. Controls the influence of a particle’s own best known - position. Typical values: 1.0 to 3.0. Default is 1.1931471805599454. - - **social** (float): - Social coefficient. Denoted by :math:`\phi_g`. Controls the influence of the swarm’s best known - position. Typical values: 1.0 to 3.0. Default is 1.1931471805599454. - - **quasi\_opp\_init** (bool): Whether to use quasi-opposition initialization. Default is False. - - **speed\_quasi\_opp\_init** (bool): - Whether to apply quasi-opposition initialization to speed. Default is False. - - **special\_speed\_quasi\_opp\_init** (bool): - Whether to use special quasi-opposition initialization for speed. Default is False. - - **sigma**: - Standard deviation for sampling initial population from N(0, σ²) in case bounds are not provided. + import optimagic as om + om.minimize( + ..., + algorithm=om.algos.nevergrad_pso(stopping_maxfun=1_000, ...) + ) + + or + + .. code-block:: + + om.minimize( + ..., + algorithm="nevergrad_pso", + algo_options={"stopping_maxfun": 1_000, ...} + ) + + **Description and available options:** + + .. autoclass:: optimagic.optimizers.nevergrad_optimizers.NevergradPSO + ``` ```{eval-rst} -.. dropdown:: nevergrad_cmaes +.. dropdown:: nevergrad_cmaes + + **How to use this algorithm:** .. code-block:: - "nevergrad_cmaes" + import optimagic as om + om.minimize( + ..., + algorithm=om.algos.nevergrad_cmaes(stopping_maxfun=1_000, ...) + ) + + or + + .. code-block:: + + om.minimize( + ..., + algorithm="nevergrad_cmaes", + algo_options={"stopping_maxfun": 1_000, ...} + ) + + **Description and available options:** + + .. autoclass:: optimagic.optimizers.nevergrad_optimizers.NevergradCMAES - Minimize a scalar function using the Covariance Matrix Adaptation Evolution Strategy (CMA-ES) - algorithm. 
- - The CMA-ES (Covariance Matrix Adaptation Evolution Strategy) is a state-of-the-art evolutionary - algorithm designed for difficult non-linear, non-convex, black-box optimization problems in - continuous domains. It is typically applied to unconstrained or bounded optimization problems with - dimensionality between 3 and 100. CMA-ES adapts a multivariate normal distribution to approximate - the shape of the objective function. It estimates a positive-definite covariance matrix, akin to the - inverse Hessian in convex-quadratic problems, but without requiring derivatives or their - approximation. Original paper can be accessed at `cma `_. This - implementation is a python wrapper over the original code `pycma `_. - - - **scale**: Scale of the search. - - **elitist**: - Whether to switch to elitist mode (also known as (μ,λ)-CMA-ES). In elitist mode, the best point in - the population is always retained. - - **population\_size**: Population size. - - **diagonal**: Use the diagonal version of CMA, which is more efficient for high-dimensional problems. - - **high\_speed**: Use a metamodel for recommendation to speed up optimization. - - **fast\_cmaes**: - Use the fast CMA-ES implementation. Cannot be used with diagonal=True. Produces equivalent results - and is preferable for high dimensions or when objective function evaluations are fast. - - **random\_init**: If True, initialize the optimizer with random parameters. - - **n\_cores**: Number of cores to use for parallel function evaluation. - - **step\_size\_adaptive**: - Whether to adapt the step size. Can be a boolean or a string specifying the adaptation strategy. - - **CSA\_dampfac**: Damping factor for step size adaptation. - - **CMA\_dampsvec\_fade**: Damping rate for step size adaptation. - - **CSA\_squared**: Whether to use squared step sizes in updates. - - **CMA\_on**: Learning rate for the covariance matrix update. - - **CMA\_rankone**: Multiplier for the rank-one update learning rate of the covariance matrix. - - **CMA\_rankmu**: Multiplier for the rank-mu update learning rate of the covariance matrix. - - **CMA\_cmean**: Learning rate for the mean update. - - **CMA\_diagonal\_decoding**: Learning rate for the diagonal update. - - **num\_parents**: Number of parents (μ) for recombination. - - **CMA\_active**: Whether to use negative updates for the covariance matrix. - - **CMA\_mirrormethod**: Strategy for mirror sampling. Possible values are: - - **0**: Unconditional mirroring - - **1**: Selective mirroring - - **2**: Selective mirroring with delay (default) - - **CMA\_const\_trace**: How to normalize the trace of the covariance matrix. Valid values are: - - False: No normalization - - True: Normalize to 1 - - "arithm": Arithmetic mean normalization - - "geom": Geometric mean normalization - - "aeig": Arithmetic mean of eigenvalues - - "geig": Geometric mean of eigenvalues - - **CMA\_diagonal**: - Number of iterations to use diagonal covariance matrix before switching to full matrix. If False, - always use full matrix. - - **stopping\_maxfun**: Maximum number of function evaluations before termination. - - **stopping\_maxiter**: Maximum number of iterations before termination. - - **stopping\_timeout**: Maximum time in seconds before termination. - - **stopping\_cov\_mat\_cond**: Maximum condition number of the covariance matrix before termination. - - **convergence\_ftol\_abs**: Absolute tolerance on function value changes for convergence. 
- - **convergence\_ftol\_rel**: Relative tolerance on function value changes for convergence. - - **convergence\_xtol\_abs**: Absolute tolerance on parameter changes for convergence. - - **convergence\_iter\_noimprove**: Number of iterations without improvement before termination. - - **invariant\_path**: Whether evolution path (pc) should be invariant to transformations. - - **eval\_final\_mean**: Whether to evaluate the final mean solution. - - **seed**: Seed used by the internal random number generator for reproducibility. - - **sigma**: - Standard deviation for sampling initial population from N(0, σ²) in case bounds are not provided. ``` ```{eval-rst} .. dropdown:: nevergrad_oneplusone + **How to use this algorithm:** + .. code-block:: - "nevergrad_oneplusone" - - Minimize a scalar function using the One Plus One Evolutionary algorithm from Nevergrad. - - THe One Plus One evolutionary algorithm iterates to find a set of parameters that minimizes the loss - function. It does this by perturbing, or mutating, the parameters from the last iteration (the - parent). If the new (child) parameters yield a better result, then the child becomes the new parent - whose parameters are perturbed, perhaps more aggressively. If the parent yields a better result, it - remains the parent and the next perturbation is less aggressive. Originally proposed by - :cite:`Rechenberg1973`. The implementation in Nevergrad is based on the one-fifth adaptation rule, - going back to :cite:`Schumer1968. - - - **noise\_handling**: Method for handling the noise, can be - - "random": A random point is reevaluated regularly using the one-fifth adaptation rule. - - "optimistic": The best optimistic point is reevaluated regularly, embracing optimism in the face of uncertainty. - - A float coefficient can be provided to tune the regularity of these reevaluations (default is 0.05). Eg: with 0.05, each evaluation has a 5% chance (i.e., 1 in 20) of being repeated (i.e., the same candidate solution is reevaluated to better estimate its performance). (Default: `None`). - - **n\_cores**: Number of cores to use. - - stopping.maxfun: Maximum number of function evaluations. - - **mutation**: Type of mutation to apply. Available options are (Default: `"gaussian"`). - - "gaussian": Standard mutation by adding a Gaussian random variable (with progressive widening) to the best pessimistic point. - - "cauchy": Same as Gaussian but using a Cauchy distribution. - - "discrete": Mutates a randomly drawn variable (mutation occurs with probability 1/d in d dimensions, hence ~1 variable per mutation). - - "discreteBSO": Follows brainstorm optimization by gradually decreasing mutation rate from 1 to 1/d. - - "fastga": Fast Genetic Algorithm mutations from the current best. - - "doublefastga": Double-FastGA mutations from the current best :cite:`doerr2017`. - - "rls": Randomized Local Search — mutates one and only one variable. - - "portfolio": Random number of mutated bits, known as uniform mixing :cite:`dang2016`. - - "lengler": Mutation rate is a function of dimension and iteration index. - - "lengler{2|3|half|fourth}": Variants of the Lengler mutation rate adaptation. - - **sparse**: Whether to apply random mutations that set variables to zero. Default is `False`. - - **smoother**: Whether to suggest smooth mutations. Default is `False`. - - **annealing**: - Annealing schedule to apply to mutation amplitude or temperature-based control. Options are: - - "none": No annealing is applied. - - "Exp0.9": Exponential decay with rate 0.9. 
- - "Exp0.99": Exponential decay with rate 0.99. - - "Exp0.9Auto": Exponential decay with rate 0.9, auto-scaled based on problem horizon. - - "Lin100.0": Linear decay from 1 to 0 over 100 iterations. - - "Lin1.0": Linear decay from 1 to 0 over 1 iteration. - - "LinAuto": Linearly decaying annealing automatically scaled to the problem horizon. Default is `"none"`. - - **super\_radii**: - Whether to apply extended radii beyond standard bounds for candidate generation, enabling broader - exploration. Default is `False`. - - **roulette\_size**: - Size of the roulette wheel used for selection in the evolutionary process. Affects the sampling - diversity from past candidates. (Default: `64`) - - **antismooth**: - Degree of anti-smoothing applied to prevent premature convergence in smooth landscapes. This alters - the landscape by penalizing overly smooth improvements. (Default: `4`) - - **crossover**: Whether to include a genetic crossover step every other iteration. Default is `False`. - - **crossover\_type**: - Method used for genetic crossover between individuals in the population. Available options (Default: `"none"`): - - "none": No crossover is applied. - - "rand": Randomized selection of crossover point. - - "max": Crossover at the point with maximum fitness gain. - - "min": Crossover at the point with minimum fitness gain. - - "onepoint": One-point crossover, splitting the genome at a single random point. - - "twopoint": Two-point crossover, splitting the genome at two points and exchanging the middle section. - - **tabu\_length**: - Length of the tabu list used to prevent revisiting recently evaluated candidates in local search - strategies. Helps in escaping local minima. (Default: `1000`) - - **rotation**: - Whether to apply rotational transformations to the search space, promoting invariance to axis- - aligned structures and enhancing search performance in rotated coordinate systems. (Default: - `False`) - - **seed**: Seed for the random number generator for reproducibility. - - **sigma**: - Standard deviation for sampling initial population from N(0, σ²) in case bounds are not provided. + import optimagic as om + om.minimize( + ..., + algorithm=om.algos.nevergrad_oneplusone(stopping_maxfun=1_000, ...) + ) + + or + + .. code-block:: + + om.minimize( + ..., + algorithm="nevergrad_oneplusone", + algo_options={"stopping_maxfun": 1_000, ...} + ) + + **Description and available options:** + + .. autoclass:: optimagic.optimizers.nevergrad_optimizers.NevergradOnePlusOne ``` ```{eval-rst} .. dropdown:: nevergrad_de + **How to use this algorithm:** + .. code-block:: - "nevergrad_de" - - Minimize a scalar function using the Differential Evolution optimizer from Nevergrad. - - Differential Evolution is typically used for continuous optimization. It uses differences between - points in the population for performing mutations in fruitful directions; it is therefore a kind of - covariance adaptation without any explicit covariance, making it very fast in high dimensions. - - - **initialization**: - Algorithm/distribution used for initialization. Can be one of: "parametrization" (uses - parametrization's sample method), "LHS" (Latin Hypercube Sampling), "QR" (Quasi-Random), "QO" - (Quasi-Orthogonal), or "SO" (Sobol sequence). - - **scale**: Scale of random component of updates. Can be a float or a string. - - **recommendation**: Criterion for selecting the best point to recommend. - - **Options**: "pessimistic", "optimistic", "mean", or "noisy". - - **crossover**: Crossover rate or strategy. 
Can be: - - float: Fixed crossover rate - - "dimension": 1/dimension - - "random": Random uniform rate per iteration - - "onepoint": One-point crossover - - "twopoints": Two-points crossover - - "rotated_twopoints": Rotated two-points crossover - - "parametrization": Use parametrization's recombine method - - **F1**: Differential weight #1 (scaling factor). - - **F2**: Differential weight #2 (scaling factor). - - **popsize**: Population size. Can be an integer or one of: - - "standard": max(num_workers, 30) - - "dimension": max(num_workers, 30, dimension + 1) - - "large": max(num_workers, 30, 7 * dimension) - - **high\_speed**: If True, uses a metamodel for recommendations to speed up optimization. - - **stopping\_maxfun**: Maximum number of function evaluations before termination. - - **n\_cores**: Number of cores to use for parallel function evaluation. - - **seed**: Seed for the random number generator for reproducibility. - - **sigma**: - Standard deviation for sampling initial population from N(0, σ²) in case bounds are not provided. + import optimagic as om + om.minimize( + ..., + algorithm=om.algos.nevergrad_de(population_size="large", ...) + ) + + or + + .. code-block:: + + om.minimize( + ..., + algorithm="nevergrad_de", + algo_options={"population_size": "large", ...} + ) + + **Description and available options:** + + .. autoclass:: optimagic.optimizers.nevergrad_optimizers.NevergradDifferentialEvolution ``` ```{eval-rst} -.. dropdown:: nevergrad_bo +.. dropdown:: nevergrad_bo + + .. note:: + + Using this optimizer requires the `bayes-optim` package to be installed as well. + This can be done with `pip install bayes-optim`. + + **How to use this algorithm:** + + .. code-block:: + + import optimagic as om + om.minimize( + ..., + algorithm=om.algos.nevergrad_bo(stopping_maxfun=1_000, ...) + ) + + or .. code-block:: - "nevergrad_bo" - - Minimize a scalar function using the Bayes Optim algorithm. BO and PCA-BO algorithms from the - `bayes_optim `_ package PCA-BO (Principal - Component Analysis for Bayesian Optimization) is a dimensionality reduction technique for black-box - optimization. It applies PCA to the input space before performing Bayesian optimization, improving - efficiency in high dimensions by focusing on directions of greatest variance. This helps concentrate - search in informative subspaces and reduce sample complexity. :cite:`bayesoptimimpl`. - - - **init\_budget**: Number of initialization algorithm steps. - - **pca**: Whether to use the PCA transformation, defining PCA-BO rather than standard BO. - - **n\_components**: - Number of principal axes in feature space representing directions of maximum variance in the data. - Represents the percentage of explained variance (e.g., 0.95 means 95% variance retained). - - **prop\_doe\_factor**: - Percentage of the initial budget used for DoE, potentially overriding `init_budget`. For - - **stopping\_maxfun**: Maximum number of function evaluations before termination. - - **n\_cores**: Number of cores to use for parallel function evaluation. - - **seed**: Seed for the random number generator for reproducibility. - - **sigma**: - Standard deviation for sampling initial population from N(0, σ²) in case bounds are not provided. + om.minimize( + ..., + algorithm="nevergrad_bo", + algo_options={"stopping_maxfun": 1_000, ...} + ) + + **Description and available options:** + + .. autoclass:: optimagic.optimizers.nevergrad_optimizers.NevergradBayesOptim ``` ```{eval-rst} .. 
dropdown:: nevergrad_emna + **How to use this algorithm:** + .. code-block:: - "nevergrad_emna" - - Minimize a scalar function using the Estimation of Multivariate Normal Algorithm. - - Estimation of Multivariate Normal Algorithm (EMNA), a distribution-based evolutionary algorithm that - models the search space using a multivariate Gaussian. EMNA learns the full covariance matrix of the - Gaussian sampling distribution, resulting in a cubic time complexity w.r.t. each sampling. It is - highly recommended to first attempt other more advanced optimization methods for LBO. See - :cite:`emnaimpl`. This algorithm is quite efficient in a parallel setting, i.e. when the population - size is large. - - - **isotropic**: - If True, uses an isotropic (identity covariance) Gaussian. If False, uses a separable (diagonal - covariance) Gaussian for greater flexibility in anisotropic landscapes. - - **noise\_handling**: - If True, returns the best individual found. If False (recommended for noisy problems), returns the - average of the final population to reduce noise. - - **population\_size\_adaptation**: - If True, the population size is adjusted automatically based on the optimization landscape and noise - level. - - **initial\_popsize**: Initial population size. Default: 4 x dimension.. - - **stopping\_maxfun**: Maximum number of function evaluations before termination. - - **n\_cores**: Number of cores to use for parallel function evaluation. - - **seed**: Seed for the random number generator for reproducibility. - - **sigma**: - Standard deviation for sampling initial population from N(0, σ²) in case bounds are not provided. + import optimagic as om + om.minimize( + ..., + algorithm=om.algos.nevergrad_emna(noise_handling=False, ...) + ) + + or + + .. code-block:: + + om.minimize( + ..., + algorithm="nevergrad_emna", + algo_options={"noise_handling": False, ...} + ) + + **Description and available options:** + + .. autoclass:: optimagic.optimizers.nevergrad_optimizers.NevergradEMNA ``` ```{eval-rst} .. dropdown:: nevergrad_cga + **How to use this algorithm:** + .. code-block:: - "nevergrad_cga" - - Minimize a scalar function using the Compact Genetic Algorithm. - - The Compact Genetic Algorithm (cGA) is a memory-efficient genetic algorithm that represents the - population as a probability vector over gene values. It simulates the order-one behavior of a simple - GA with uniform crossover, updating probabilities instead of maintaining an explicit population. cGA - processes each gene independently and is well-suited for large or constrained environments. For - details see :cite:`cgaimpl`. - - - **stopping\_maxfun**: Maximum number of function evaluations before termination. - - **n\_cores**: Number of cores to use for parallel function evaluation. - - **seed**: Seed for the random number generator for reproducibility. - - **sigma**: - Standard deviation for sampling initial population from N(0, σ²) in case bounds are not provided. + import optimagic as om + om.minimize( + ..., + algorithm=om.algos.nevergrad_cga(stopping_maxfun=10_000) + ) + + or + + .. code-block:: + + om.minimize( + ..., + algorithm="nevergrad_cga", + algo_options={"stopping_maxfun": 10_000} + ) + + **Description and available options:** + + .. autoclass:: optimagic.optimizers.nevergrad_optimizers.NevergradCGA ``` ```{eval-rst} .. dropdown:: nevergrad_eda + **How to use this algorithm:** + .. code-block:: - "nevergrad_eda" - - Minimize a scalar function using the Estimation of distribution algorithm. 
- - Estimation of Distribution Algorithms (EDAs) optimize by building and sampling a probabilistic model - of promising solutions. Instead of using traditional variation operators like crossover or mutation, - EDAs update a distribution based on selected individuals and sample new candidates from it. This - allows efficient exploration of complex or noisy search spaces. In short, EDAs typically do not - directly evolve populations of search points but build probabilistic models of promising solutions - by repeatedly sampling and selecting points from the underlying search space. Refer :cite:`edaimpl`. - - - **stopping\_maxfun**: Maximum number of function evaluations before termination. - - **n\_cores**: Number of cores to use for parallel function evaluation. - - **seed**: Seed for the random number generator for reproducibility. - - **sigma**: - Standard deviation for sampling initial population from N(0, σ²) in case bounds are not provided. + import optimagic as om + om.minimize( + ..., + algorithm=om.algos.nevergrad_eda(stopping_maxfun=10_000) + ) + + or + + .. code-block:: + + om.minimize( + ..., + algorithm="nevergrad_eda", + algo_options={"stopping_maxfun": 10_000} + ) + + **Description and available options:** + + .. autoclass:: optimagic.optimizers.nevergrad_optimizers.NevergradEDA ``` ```{eval-rst} .. dropdown:: nevergrad_tbpsa + **How to use this algorithm:** + + .. code-block:: + + import optimagic as om + om.minimize( + ..., + algorithm=om.algos.nevergrad_tbpsa(noise_handling=False, ...) + ) + + or + .. code-block:: - "nevergrad_tbpsa" - - Minimize a scalar function using the Test-based population size adaptation algorithm. - - TBPSA adapts population size based on fitness trend detection using linear regression. If no - significant improvement is found (via hypothesis testing), the population size is increased to - improve robustness in noisy settings. This method performs the best in many noisy optimization - problems, even in large dimensions. For more details, refer :cite:`tbpsaimpl` - - - **noise\_handling**: - If True, returns the best individual seen so far. If False (recommended for noisy problems), returns - the average of the final population to reduce the effect of noise. - - **initial\_popsize**: Initial population size. If not specified, defaults to 4 x dimension. - - **stopping\_maxfun**: Maximum number of function evaluations before termination. - - **n\_cores**: Number of cores to use for parallel function evaluation. - - **seed**: Seed for the random number generator for reproducibility. - - **sigma**: - Standard deviation for sampling initial population from N(0, σ²) in case bounds are not provided. + om.minimize( + ..., + algorithm="nevergrad_tbpsa", + algo_options={"noise_handling": False, ...} + ) + + **Description and available options:** + + .. autoclass:: optimagic.optimizers.nevergrad_optimizers.NevergradTBPSA ``` ```{eval-rst} .. dropdown:: nevergrad_randomsearch + **How to use this algorithm:** + .. code-block:: - "nevergrad_randomsearch" - - Minimize a scalar function using the Random Search algorithm. - - This is a one-shot optimization method, provides random suggestions. - - - **middle\_point**: - Enforces that the first suggested point (ask) is the zero vector. i.e we add (0,0,...,0) as a first - point. - - **opposition\_mode**: Symmetrizes exploration with respect to the center. - - "opposite": enables full symmetry by always evaluating mirrored points. - - "quasi": applies randomized symmetry (less strict, more exploratory). 
- - None: disables any symmetric mirroring in the sampling process. - - **sampler**: - - "parametrization": uses the default sample() method of the parametrization, which samples uniformly within bounds or from a Gaussian. - - "gaussian": samples from a standard Gaussian distribution. - - "cauchy": uses a Cauchy distribution instead of Gaussian. - - **scale**: Scalar used to multiply suggested point values, or a string mode: - - "random": uses a randomized pattern for the scale. - - "auto": sigma = (1 + log(budget)) / (4 * log(dimension)); adjusts scale based on problem size. - - "autotune": sigma = sqrt(log(budget) / dimension); alternative auto-scaling based on budget and dimensionality. - - **recommendation\_rule**: Specifies how the final recommendation is chosen. - - "average_of_best": returns the average of top-performing candidates. - - "pessimistic": selects the pessimistic best (default); - - "average_of_exp_best": uses an exponential moving average of the best points. - - **stopping\_maxfun**: Maximum number of function evaluations before termination. - - **n\_cores**: Number of cores to use for parallel function evaluation. - - **seed**: Seed for the random number generator for reproducibility. - - **sigma**: - Standard deviation for sampling initial population from N(0, σ²) in case bounds are not provided. + import optimagic as om + om.minimize( + ..., + algorithm=om.algos.nevergrad_randomsearch(opposition_mode="quasi", ...) + ) + + or + + .. code-block:: + + om.minimize( + ..., + algorithm="nevergrad_randomsearch", + algo_options={"opposition_mode": "quasi", ...} + ) + + **Description and available options:** + + .. autoclass:: optimagic.optimizers.nevergrad_optimizers.NevergradRandomSearch ``` ```{eval-rst} .. dropdown:: nevergrad_samplingsearch + **How to use this algorithm:** + .. code-block:: - "nevergrad_samplingsearch" - - Minimize a scalar function using SamplingSearch. - - This is a one-shot optimization method, but better than random search by ensuring more uniformity. - - - **sampler**: Choice of the low-discrepancy sampler used for initial points. - - "Halton": deterministic, well-spaced sequences - - "Hammersley": similar to Halton but more uniform in low dimension - - "LHS": Latin Hypercube Sampling; ensures coverage along each axis - - **scrambled**: - If True, Adds scrambling to the search; much better in high dimension and rarely worse than the - original search. - - **middle\_point**: - If True, the first suggested point is the zero vector. Useful for initializing at the center of the - search space. - - **cauchy**: - If True, uses the inverse Cauchy distribution instead of Gaussian when projecting samples to real- - valued space (especially when no box bounds exist). - - **scale**: A float multiplier or "random". - - float: directly scales all generated points - - "random": uses a randomized scaling pattern for increased diversity - - **rescaled**: If True or a specific mode, rescales the sampling pattern. - - Ensures coverage of boundaries and may apply adaptive scaling - - Useful when original scale is too narrow or biased - - **recommendation\_rule**: How the final recommendation is chosen. - - "average_of_best": mean of the best-performing points - - "pessimistic": selects the point with best worst-case value (default) - - **stopping\_maxfun**: Maximum number of function evaluations before termination. - - **n\_cores**: Number of cores to use for parallel function evaluation. - - **seed**: Seed for the random number generator for reproducibility. 
- - **sigma**: - Standard deviation for sampling initial population from N(0, σ²) in case bounds are not provided. Notes - ----- - - Halton is a low quality sampling method when the dimension is high; it is usually better to use Halton with scrambling. - - When the budget is known in advance, it is also better to replace Halton by Hammersley. + import optimagic as om + om.minimize( + ..., + algorithm=om.algos.nevergrad_samplingsearch(sampler="Hammersley", scrambled=True) + ) + + or + + .. code-block:: + + om.minimize( + ..., + algorithm="nevergrad_samplingsearch", + algo_options={"sampler": "Hammersley", "scrambled": True} + ) + + **Description and available options:** + + .. autoclass:: optimagic.optimizers.nevergrad_optimizers.NevergradSamplingSearch ``` ```{eval-rst} .. dropdown:: nevergrad_NGOpt + **How to use this algorithm:** + + .. code-block:: + + import optimagic as om + om.minimize( + ..., + algorithm=om.algos.nevergrad_NGOpt(optimizer="NGOptRW", ...) + ) + + or + .. code-block:: - "nevergrad_NGOpt" - - Minimize a scalar function using a Meta Optimizer from Nevergrad. Each meta optimizer combines - multiples optimizers to solve a problem. - - - **optimizer**: One of - - NGOpt - - NGOpt4 - - NGOpt8 - - NGOpt10 - - NGOpt12 - - NGOpt13 - - NGOpt14 - - NGOpt15 - - NGOpt16 - - NGOpt21 - - NGOpt36 - - NGOpt38 - - NGOpt39 - - NGOptRW - - NGOptF - - NGOptF2 - - NGOptF3 - - NGOptF5 - - NgIoh2 - - NgIoh3 - - NgIoh4 - - NgIoh5 - - NgIoh6 - - NgIoh7 - - NgIoh8 - - NgIoh9 - - NgIoh10 - - NgIoh11 - - NgIoh12 - - NgIoh13 - - NgIoh14 - - NgIoh15 - - NgIoh16 - - NgIoh17 - - NgIoh18 - - NgIoh19 - - NgIoh20 - - NgIoh21 - - NgIoh12b - - NgIoh13b - - NgIoh14b - - NgIoh15b - - NgIohRW2 - - NgIohTuned - - NgDS - - NgDS2 - - NGDSRW - - NGO - - CSEC - - CSEC10 - - CSEC11 - - Wiz - - **stopping\_maxfun**: Maximum number of function evaluations before termination. - - **n\_cores**: Number of cores to use for parallel function evaluation. - - **seed**: Seed for the random number generator for reproducibility. - - **sigma**: - Standard deviation for sampling initial population from N(0, σ²) in case bounds are not provided. + om.minimize( + ..., + algorithm="nevergrad_NGOpt", + algo_options={"optimizer": "NGOptRW", ...} + ) + + **Description and available options:** + + .. autoclass:: optimagic.optimizers.nevergrad_optimizers.NevergradNGOpt ``` ```{eval-rst} .. dropdown:: nevergrad_meta + **How to use this algorithm:** + .. code-block:: - "nevergrad_meta" - - Minimize a scalar function using a Meta Optimizer from Nevergrad. Utilizes a combination of local - and global optimizers to find the best solution. Local optimizers like BFGS are wrappers over scipy - implementations. Each meta optimizer combines multiples optimizers to solve a problem. - - - **optimizer**: One of - - MultiBFGSPlus - - LogMultiBFGSPlus - - SqrtMultiBFGSPlus - - MultiCobylaPlus - - MultiSQPPlus - - BFGSCMAPlus - - LogBFGSCMAPlus - - SqrtBFGSCMAPlus - - SQPCMAPlus - - LogSQPCMAPlus - - SqrtSQPCMAPlus - - MultiBFGS - - LogMultiBFGS - - SqrtMultiBFGS - - MultiCobyla - - ForceMultiCobyla - - MultiSQP - - BFGSCMA - - LogBFGSCMA - - SqrtBFGSCMA - - SQPCMA - - LogSQPCMA - - SqrtSQPCMA - - FSQPCMA - - F2SQPCMA - - F3SQPCMA - - MultiDiscrete - - CMandAS2 - - CMandAS3 - - MetaCMA - - CMA - - PCEDA - - MPCEDA - - MEDA - - NoisyBandit - - Shiwa - - Carola3 - - **stopping\_maxfun**: Maximum number of function evaluations before termination. - - **n\_cores**: Number of cores to use for parallel function evaluation. 
- - **seed**: Seed for the random number generator for reproducibility. - - **sigma**: - Standard deviation for sampling initial population from N(0, σ²) in case bounds are not provided. + import optimagic as om + om.minimize( + ..., + algorithm=om.algos.nevergrad_meta(optimizer="BFGSCMAPlus", ...) + ) + + or + + .. code-block:: + + om.minimize( + ..., + algorithm="nevergrad_meta", + algo_options={"optimizer": "BFGSCMAPlus", ...} + ) + + **Description and available options:** + + .. autoclass:: optimagic.optimizers.nevergrad_optimizers.NevergradMeta ``` ## Bayesian Optimization @@ -4552,6 +4353,8 @@ We wrap the [BayesianOptimization](https://github.com/bayesian-optimization/BayesianOptimization) package. To use it, you need to have [bayesian-optimization](https://pypi.org/project/bayesian-optimization/) installed. +Note: This optimizer requires `bayesian_optimization > 2.0.0` to be installed which is +incompatible with `nevergrad > 1.0.3`. ```{eval-rst} .. dropdown:: bayes_opt @@ -4627,80 +4430,6 @@ package. To use it, you need to have - **n_restarts** (int): Number of times to restart the optimizer. Default is 1. ``` -```{eval-rst} -.. dropdown:: nevergrad_oneplusone - - .. code-block:: - - "nevergrad_oneplusone" - - Minimize a scalar function using the One Plus One Evolutionary algorithm from Nevergrad. - - THe One Plus One evolutionary algorithm iterates to find a set of parameters that minimizes the loss - function. It does this by perturbing, or mutating, the parameters from the last iteration (the - parent). If the new (child) parameters yield a better result, then the child becomes the new parent - whose parameters are perturbed, perhaps more aggressively. If the parent yields a better result, it - remains the parent and the next perturbation is less aggressive. Originally proposed by - :cite:`Rechenberg1973`. The implementation in Nevergrad is based on the one-fifth adaptation rule, - going back to :cite:`Schumer1968. - - - **noise\_handling**: Method for handling the noise, can be - - "random": A random point is reevaluated regularly using the one-fifth adaptation rule. - - "optimistic": The best optimistic point is reevaluated regularly, embracing optimism in the face of uncertainty. - - A float coefficient can be provided to tune the regularity of these reevaluations (default is 0.05). Eg: with 0.05, each evaluation has a 5% chance (i.e., 1 in 20) of being repeated (i.e., the same candidate solution is reevaluated to better estimate its performance). (Default: `None`). - - **n\_cores**: Number of cores to use. - - - **stopping.maxfun**: Maximum number of function evaluations. - - **mutation**: Type of mutation to apply. Available options are (Default: `"gaussian"`). - - "gaussian": Standard mutation by adding a Gaussian random variable (with progressive widening) to the best pessimistic point. - - "cauchy": Same as Gaussian but using a Cauchy distribution. - - "discrete": Mutates a randomly drawn variable (mutation occurs with probability 1/d in d dimensions, hence ~1 variable per mutation). - - "discreteBSO": Follows brainstorm optimization by gradually decreasing mutation rate from 1 to 1/d. - - "fastga": Fast Genetic Algorithm mutations from the current best. - - "doublefastga": Double-FastGA mutations from the current best :cite:`doerr2017`. - - "rls": Randomized Local Search — mutates one and only one variable. - - "portfolio": Random number of mutated bits, known as uniform mixing :cite:`dang2016`. 
- - "lengler": Mutation rate is a function of dimension and iteration index. - - "lengler{2|3|half|fourth}": Variants of the Lengler mutation rate adaptation. - - **sparse**: Whether to apply random mutations that set variables to zero. Default is `False`. - - **smoother**: Whether to suggest smooth mutations. Default is `False`. - - **annealing**: - Annealing schedule to apply to mutation amplitude or temperature-based control. Options are: - - "none": No annealing is applied. - - "Exp0.9": Exponential decay with rate 0.9. - - "Exp0.99": Exponential decay with rate 0.99. - - "Exp0.9Auto": Exponential decay with rate 0.9, auto-scaled based on problem horizon. - - "Lin100.0": Linear decay from 1 to 0 over 100 iterations. - - "Lin1.0": Linear decay from 1 to 0 over 1 iteration. - - "LinAuto": Linearly decaying annealing automatically scaled to the problem horizon. Default is `"none"`. - - **super\_radii**: - Whether to apply extended radii beyond standard bounds for candidate generation, enabling broader - exploration. Default is `False`. - - **roulette\_size**: - Size of the roulette wheel used for selection in the evolutionary process. Affects the sampling - diversity from past candidates. (Default: `64`) - - **antismooth**: - Degree of anti-smoothing applied to prevent premature convergence in smooth landscapes. This alters - the landscape by penalizing overly smooth improvements. (Default: `4`) - - **crossover**: Whether to include a genetic crossover step every other iteration. Default is `False`. - - **crossover\_type**: - Method used for genetic crossover between individuals in the population. Available options (Default: `"none"`): - - "none": No crossover is applied. - - "rand": Randomized selection of crossover point. - - "max": Crossover at the point with maximum fitness gain. - - "min": Crossover at the point with minimum fitness gain. - - "onepoint": One-point crossover, splitting the genome at a single random point. - - "twopoint": Two-point crossover, splitting the genome at two points and exchanging the middle section. - - **tabu\_length**: - Length of the tabu list used to prevent revisiting recently evaluated candidates in local search - strategies. Helps in escaping local minima. (Default: `1000`) - - **rotation**: - Whether to apply rotational transformations to the search space, promoting invariance to axis- - aligned structures and enhancing search performance in rotated coordinate systems. (Default: - `False`) - - **seed**: Seed for the random number generator for reproducibility. -``` - ## References ```{eval-rst} diff --git a/docs/source/how_to/how_to_start_parameters.md b/docs/source/how_to/how_to_start_parameters.md index fc5a031e9..0c13ba6bf 100644 --- a/docs/source/how_to/how_to_start_parameters.md +++ b/docs/source/how_to/how_to_start_parameters.md @@ -14,125 +14,120 @@ advantages and drawbacks of each of them. Again, we use the simple `sphere` function you know from other tutorials as an example. ```{eval-rst} -.. tabbed:: Array - A frequent choice of ``params`` is a one-dimensional numpy array. This is - because one-dimensional numpy arrays are all that is supported by most optimizer - libraries. +.. tab-set:: + .. tab-item:: Array - In our opinion, it is rarely a good choice to represent parameters as flat numpy arrays - and then access individual parameters or sclices by positions. The only exception - are simple optimization problems with very-fast-to-evaluate criterion functions where - any overhead must be avoided. 
+        A frequent choice of ``params`` is a one-dimensional numpy array. This is
+        because one-dimensional numpy arrays are all that is supported by most optimizer
+        libraries.
+
+        In our opinion, it is rarely a good choice to represent parameters as flat numpy arrays
+        and then access individual parameters or slices by positions. The only exception
+        are simple optimization problems with very-fast-to-evaluate criterion functions where
+        any overhead must be avoided.

-    If you still want to use one-dimensional numpy arrays, here is how:
+        If you still want to use one-dimensional numpy arrays, here is how:

-    .. code-block:: python
+        .. code-block:: python

-        import optimagic as om
+            import optimagic as om

-        def sphere(params):
-            return params @ params
+            def sphere(params):
+                return params @ params

-        om.minimize(
-            fun=sphere,
-            params=np.arange(3),
-            algorithm="scipy_lbfgsb",
-        )
-```
+            om.minimize(
+                fun=sphere,
+                params=np.arange(3),
+                algorithm="scipy_lbfgsb",
+            )

-```{eval-rst}
-.. tabbed:: DataFrame
+    .. tab-item:: DataFrame

-    Originally, pandas DataFrames were the mandatory format for ``params`` in optimagic.
-    They are still highly recommended and have a few special features. For example,
-    they allow to bundle information on start parameters and bounds together into one
-    data structure.
+        Originally, pandas DataFrames were the mandatory format for ``params`` in optimagic.
+        They are still highly recommended and have a few special features. For example,
+        they allow to bundle information on start parameters and bounds together into one
+        data structure.

-    Let's look at an example where we do that:
+        Let's look at an example where we do that:

-    .. code-block:: python
+        .. code-block:: python

-        def sphere(params):
-            return (params["value"] ** 2).sum()
+            def sphere(params):
+                return (params["value"] ** 2).sum()

-        params = pd.DataFrame(
-            data={"value": [1, 2, 3], "lower_bound": [-np.inf, 1.5, 0]},
-            index=["a", "b", "c"],
-        )
+            params = pd.DataFrame(
+                data={"value": [1, 2, 3], "lower_bound": [-np.inf, 1.5, 0]},
+                index=["a", "b", "c"],
+            )

-        om.minimize(
-            fun=sphere,
-            params=params,
-            algorithm="scipy_lbfgsb",
-        )
+            om.minimize(
+                fun=sphere,
+                params=params,
+                algorithm="scipy_lbfgsb",
+            )

-    DataFrames have many advantages:
+        DataFrames have many advantages:

-    - It is easy to select single parameters or groups of parameters or work with
-      the entire parameter vector. Especially, if you use a well designed MultiIndex.
-    - It is very easy to produce publication quality LaTeX tables from them.
-    - If you have nested models, you can easily update the parameter vector of a larger
-      model with the values from a smaller one (e.g. to get good start parameters).
-    - You can bundle information on bounds and values in one place.
-    - It is easy to compare two params vectors for equality.
+        - It is easy to select single parameters or groups of parameters or work with
+          the entire parameter vector. Especially, if you use a well designed MultiIndex.
+        - It is very easy to produce publication quality LaTeX tables from them.
+        - If you have nested models, you can easily update the parameter vector of a larger
+          model with the values from a smaller one (e.g. to get good start parameters).
+        - You can bundle information on bounds and values in one place.
+        - It is easy to compare two params vectors for equality.

-    If you are sure you won't have bounds on your parameter, you can also use a
-    pandas.Series instead of a pandas.DataFrame.
+        If you are sure you won't have bounds on your parameter, you can also use a
+        pandas.Series instead of a pandas.DataFrame.

-    A drawback of DataFrames is that they are not JAX compatible. Another one is that
-    they are a bit slower than numpy arrays.
+        A drawback of DataFrames is that they are not JAX compatible. Another one is that
+        they are a bit slower than numpy arrays.

-```
-```{eval-rst}
-.. tabbed:: Dict
+    .. tab-item:: Dict

-    ``params`` can also be a (nested) dictionary containing all of the above and more.
+        ``params`` can also be a (nested) dictionary containing all of the above and more.

-    .. code-block:: python
+        .. code-block:: python

-        def sphere(params):
-            return params["a"] ** 2 + params["b"] ** 2 + (params["c"] ** 2).sum()
+            def sphere(params):
+                return params["a"] ** 2 + params["b"] ** 2 + (params["c"] ** 2).sum()

-        res = om.minimize(
-            fun=sphere,
-            params={"a": 0, "b": 1, "c": pd.Series([2, 3, 4])},
-            algorithm="scipy_neldermead",
-        )
+            res = om.minimize(
+                fun=sphere,
+                params={"a": 0, "b": 1, "c": pd.Series([2, 3, 4])},
+                algorithm="scipy_neldermead",
+            )

-    Dictionarys of arrays are ideal if you want to do vectorized computations with
-    groups of parameters. They are also a good choice if you calculate derivatives
-    with JAX.
+        Dictionaries of arrays are ideal if you want to do vectorized computations with
+        groups of parameters. They are also a good choice if you calculate derivatives
+        with JAX.

-    While optimagic won't stop you, don't go too far! Having parameters in very deeply
-    nested dictionaries makes it hard to visualize results and/or even to compare two
-    estimation results.
-```
+        While optimagic won't stop you, don't go too far! Having parameters in very deeply
+        nested dictionaries makes it hard to visualize results and/or even to compare two
+        estimation results.

-```{eval-rst}
-.. tabbed:: Scalar
+    .. tab-item:: Scalar

-    If you have a one-dimensional optimization problem, the natural way to represent
-    your params is a float:
+        If you have a one-dimensional optimization problem, the natural way to represent
+        your params is a float:

-    .. code-block:: python
+        .. code-block:: python

-        def sphere(params):
-            return params**2
+            def sphere(params):
+                return params**2

-        om.minimize(
-            fun=sphere,
-            params=3,
-            algorithm="scipy_lbfgsb",
-        )
+            om.minimize(
+                fun=sphere,
+                params=3,
+                algorithm="scipy_lbfgsb",
+            )
 ```
diff --git a/docs/source/refs.bib b/docs/source/refs.bib
index f8005d2e9..298b813ff 100644
--- a/docs/source/refs.bib
+++ b/docs/source/refs.bib
@@ -964,8 +964,9 @@ @inproceedings{tbpsaimpl
 year = {2016},
 month = {09},
 pages = {},
-title = {Evolution under Strong Noise: A Self-Adaptive Evolution Strategy Can Reach the Lower Performance Bound - the pcCMSA-ES},
-volume = {9921},
+title = {Evolution under Strong Noise: A Self-Adaptive Evolution Strategy Can Reach the Lower Performance Bound - the pcCMSA-ES},
+booktitle = {Parallel Problem Solving from Nature -- PPSN XIV},
+volume = {9921},
 isbn = {9783319458229},
 doi = {10.1007/978-3-319-45823-6_3}
 }
@@ -1037,6 +1037,7 @@ @book{emnaimpl
 pages = {},
 title = {Estimation of Distribution Algorithms: A New Tool for Evolutionary Computation},
 isbn = {9781461356042},
+publisher = {Springer},
 journal = {Genetic algorithms and evolutionary computation ; 2},
 doi = {10.1007/978-1-4615-1539-5}
 }
diff --git a/pyproject.toml b/pyproject.toml
index c74752252..58730bd0f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -380,6 +380,7 @@ module = [
     "pdbp",
     "iminuit",
     "nevergrad",
+    "nevergrad.optimization.base.ConfiguredOptimizer",
     "yaml",
 ]
 ignore_missing_imports = true
diff --git a/src/optimagic/config.py b/src/optimagic/config.py
index ce6cd4d60..3171a4195 100644
--- a/src/optimagic/config.py
+++ b/src/optimagic/config.py
@@ -38,8 +38,11 @@ def _is_installed(module_name: str) -> bool:
 IS_NUMBA_INSTALLED = _is_installed("numba")
 IS_IMINUIT_INSTALLED = _is_installed("iminuit")
 IS_NEVERGRAD_INSTALLED = _is_installed("nevergrad")
-IS_BAYESOPT_INSTALLED = _is_installed("bayes_opt")
-
+IS_BAYESOPTIM_INSTALLED = _is_installed("bayes-optim")
+IS_BAYESOPT_INSTALLED_AND_VERSION_NEWER_THAN_2 = (
+    _is_installed("bayes_opt")
+    and importlib.metadata.version("bayesian_optimization") > "2.0.0"
+)

 # ======================================================================================
 # Check if pandas version is newer or equal to version 2.1.0
diff --git a/src/optimagic/optimizers/bayesian_optimizer.py b/src/optimagic/optimizers/bayesian_optimizer.py
index 3de716a7f..93337f586 100644
--- a/src/optimagic/optimizers/bayesian_optimizer.py
+++ b/src/optimagic/optimizers/bayesian_optimizer.py
@@ -10,7 +10,7 @@
 from scipy.optimize import NonlinearConstraint

 from optimagic import mark
-from optimagic.config import IS_BAYESOPT_INSTALLED
+from optimagic.config import IS_BAYESOPT_INSTALLED_AND_VERSION_NEWER_THAN_2
 from optimagic.exceptions import NotInstalledError
 from optimagic.optimization.algo_options import N_RESTARTS
 from optimagic.optimization.algorithm import Algorithm, InternalOptimizeResult
@@ -35,7 +35,7 @@
 @mark.minimizer(
     name="bayes_opt",
     solver_type=AggregationLevel.SCALAR,
-    is_available=IS_BAYESOPT_INSTALLED,
+    is_available=IS_BAYESOPT_INSTALLED_AND_VERSION_NEWER_THAN_2,
     is_global=True,
     needs_jac=False,
     needs_hess=False,
@@ -72,7 +72,7 @@ class BayesOpt(Algorithm):
     def _solve_internal_problem(
         self, problem: InternalOptimizationProblem, x0: NDArray[np.float64]
     ) -> InternalOptimizeResult:
-        if not IS_BAYESOPT_INSTALLED:
+        if not IS_BAYESOPT_INSTALLED_AND_VERSION_NEWER_THAN_2:
             raise NotInstalledError(
                 "To use the 'bayes_opt' optimizer you need to install bayes_opt. "
                 "Use 'pip install bayesian-optimization'. 
" diff --git a/src/optimagic/optimizers/nevergrad_optimizers.py b/src/optimagic/optimizers/nevergrad_optimizers.py index 16166b0a9..258b5e39a 100644 --- a/src/optimagic/optimizers/nevergrad_optimizers.py +++ b/src/optimagic/optimizers/nevergrad_optimizers.py @@ -1,5 +1,7 @@ """Implement optimizers from the nevergrad package.""" +from __future__ import annotations + import math from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Literal @@ -8,7 +10,7 @@ from numpy.typing import NDArray from optimagic import mark -from optimagic.config import IS_NEVERGRAD_INSTALLED +from optimagic.config import IS_BAYESOPTIM_INSTALLED, IS_NEVERGRAD_INSTALLED from optimagic.exceptions import NotInstalledError from optimagic.optimization.algo_options import ( CONVERGENCE_FTOL_ABS, @@ -30,7 +32,7 @@ ) if TYPE_CHECKING: - import nevergrad as ng + from nevergrad.optimization.base import ConfiguredOptimizer NEVERGRAD_NOT_INSTALLED_ERROR = ( @@ -58,18 +60,84 @@ ) @dataclass(frozen=True) class NevergradPSO(Algorithm): + """Minimize a scalar function using the Particle Swarm Optimization algorithm. + + The Particle Swarm Optimization algorithm was originally proposed by + :cite:`Kennedy1995`.The implementation in Nevergrad is based on + :cite:`Zambrano2013`. + + PSO solves an optimization problem by evolving a swarm of particles + (candidate solutions) across the search space. Each particle adjusts its position + based on its own experience (cognitive component) and the experiences + of its neighbors or the swarm (social component), using velocity updates. The + algorithm iteratively guides the swarm toward promising regions of the search + space. + + """ + transform: Literal["arctan", "gaussian", "identity"] = "arctan" + """The transform used to map from PSO optimization space to real space.""" + population_size: int | None = None + """The number of particles in the swarm.""" + n_cores: int = 1 + """The number of CPU cores to use for parallel computation.""" + seed: int | None = None + """Random seed for reproducibility.""" + stopping_maxfun: PositiveInt = STOPPING_MAXFUN_GLOBAL + """Maximum number of function evaluations.""" + inertia: float = 0.5 / math.log(2.0) + r"""Inertia weight ω. + + Controls the influence of a particle's previous velocity. Must be less than 1 to + avoid divergence. + + """ + cognitive: float = 0.5 + math.log(2.0) + r"""Cognitive coefficient :math:`\phi_p`. + + Controls the influence of a particle's own best known position. Typical values: 1.0 + to 3.0. + + """ + social: float = 0.5 + math.log(2.0) + r"""Social coefficient. + + Denoted by :math:`\phi_g`. Controls the influence of the swarm's best known + position. Typical values: 1.0 to 3.0. + + """ + quasi_opp_init: bool = False + """Whether to use quasi-opposition initialization. + + Default is False. + + """ + speed_quasi_opp_init: bool = False + """Whether to apply quasi-opposition initialization to speed. + + Default is False. + + """ + special_speed_quasi_opp_init: bool = False + """Whether to use special quasi-opposition initialization for speed. + + Default is False. 
+ + """ + sigma: float | None = None + """Standard deviation for sampling initial population from N(0, σ²) in case bounds + are not provided.""" def _solve_internal_problem( self, problem: InternalOptimizationProblem, x0: NDArray[np.float64] @@ -121,40 +189,154 @@ def _solve_internal_problem( ) @dataclass(frozen=True) class NevergradCMAES(Algorithm): + """Minimize a scalar function using the Covariance Matrix Adaptation Evolution + Strategy (CMA-ES) algorithm. + + The CMA-ES is a state-of-the-art evolutionary algorithm for difficult non-linear, + non-convex, black-box optimization problems in continuous domains. It is typically + applied to unconstrained or bounded problems with dimensionality between 3 and 100. + CMA-ES adapts a multivariate normal distribution to approximate the objective + function's shape by estimating a positive-definite covariance matrix, akin to the + inverse Hessian in convex-quadratic problems, but without requiring derivatives. + + Original paper can be accessed at :cma:` + https://cma-es.github.io/`. + This implementation is a python wrapper over the original code :pycma:` + https://cma-es.github.io/`. + + """ + scale: NonNegativeFloat = 1.0 + """Scale of the search.""" + elitist: bool = False + """Whether to switch to elitist mode (also known as (μ,λ)-CMA-ES). + + In elitist mode, the best point in the population is always retained. + + """ + population_size: int | None = None + """Population size.""" + diagonal: bool = False + """Use the diagonal version of CMA, which is more efficient for high-dimensional + problems.""" + high_speed: bool = False + """Use a metamodel for recommendation to speed up optimization.""" + fast_cmaes: bool = False + """Use the fast CMA-ES implementation. + + Cannot be used with diagonal=True. Produces equivalent results and is preferable for + high dimensions or when objective function evaluations are fast. + + """ + random_init: bool = False + """If True, initialize the optimizer with random parameters.""" + n_cores: PositiveInt = 1 + """Number of cores to use for parallel function evaluation.""" + step_size_adaptive: bool | str = True + """Whether to adapt the step size. + + Can be a boolean or a string specifying the adaptation strategy. + + """ + CSA_dampfac: PositiveFloat = 1.0 + """Damping factor for step size adaptation.""" + CMA_dampsvec_fade: PositiveFloat = 0.1 + """Damping rate for step size adaptation.""" + CSA_squared: bool = False + """Whether to use squared step sizes in updates.""" + CMA_on: float = 1.0 + """Learning rate for the covariance matrix update.""" + CMA_rankone: float = 1.0 + """Multiplier for the rank-one update learning rate of the covariance matrix.""" + CMA_rankmu: float = 1.0 + """Multiplier for the rank-mu update learning rate of the covariance matrix.""" + CMA_cmean: float = 1.0 + """Learning rate for the mean update.""" + CMA_diagonal_decoding: float = 0.0 + """Learning rate for the diagonal update.""" + num_parents: int | None = None + """Number of parents (μ) for recombination.""" + CMA_active: bool = True + """Whether to use negative updates for the covariance matrix.""" + CMA_mirrormethod: Literal[0, 1, 2] = 2 + """Strategy for mirror sampling. + + 0: Unconditional, 1: Selective, 2: Selective + with delay. + + """ + CMA_const_trace: bool | Literal["arithm", "geom", "aeig", "geig"] = False + """How to normalize the trace of the covariance matrix. + + False: No normalization, + True: Normalize to 1. Other options: 'arithm', 'geom', 'aeig', 'geig'. 
+ + """ + CMA_diagonal: int | bool = False + """Number of iterations to use diagonal covariance matrix before switching to full + matrix. + + If False, always use full matrix. + + """ + stopping_maxfun: PositiveInt = STOPPING_MAXFUN_GLOBAL + """Maximum number of function evaluations before termination.""" + stopping_maxiter: PositiveInt = STOPPING_MAXITER + """Maximum number of iterations before termination.""" + stopping_maxtime: PositiveFloat = float("inf") + """Maximum time in seconds before termination.""" + stopping_cov_mat_cond: NonNegativeFloat = 1e14 + """Maximum condition number of the covariance matrix before termination.""" + convergence_ftol_abs: NonNegativeFloat = CONVERGENCE_FTOL_ABS + """Absolute tolerance on function value changes for convergence.""" + convergence_ftol_rel: NonNegativeFloat = CONVERGENCE_FTOL_REL + """Relative tolerance on function value changes for convergence.""" + convergence_xtol_abs: NonNegativeFloat = CONVERGENCE_XTOL_ABS + """Absolute tolerance on parameter changes for convergence.""" + convergence_iter_noimprove: PositiveInt | None = None + """Number of iterations without improvement before termination.""" + invariant_path: bool = False + """Whether evolution path (pc) should be invariant to transformations.""" + eval_final_mean: bool = True + """Whether to evaluate the final mean solution.""" + seed: int | None = None + """Seed used by the internal random number generator for reproducibility.""" + sigma: float | None = None + """Standard deviation for sampling initial population from N(0, σ²)in case bounds + are not provided.""" def _solve_internal_problem( self, problem: InternalOptimizationProblem, x0: NDArray[np.float64] @@ -231,11 +413,34 @@ def _solve_internal_problem( ) @dataclass(frozen=True) class NevergradOnePlusOne(Algorithm): + """Minimize a scalar function using the One-Plus-One Evolutionary algorithm. + + The One-Plus-One evolutionary algorithm iterates to find a set of parameters + that minimizes the loss function. It does this by perturbing, or mutating, + the parameters from the last iteration (the parent). If the new (child) + parameters yield a better result, the child becomes the new parent whose + parameters are perturbed, perhaps more aggressively. If the parent yields a + better result, it remains the parent and the next perturbation is less + aggressive. + + Originally proposed by :cite:`Rechenberg1973`. The implementation in + Nevergrad is based on the one-fifth adaptation rule from :cite:`Schumer1968`. + + """ + noise_handling: ( Literal["random", "optimistic"] | tuple[Literal["random", "optimistic"], float] | None ) = None + """Method for handling noise. + + 'random' reevaluates a random point, while 'optimistic' reevaluates the best + optimistic point. A float coefficient can be provided to tune the regularity of + these reevaluations. + + """ + mutation: Literal[ "gaussian", "cauchy", @@ -261,27 +466,75 @@ class NevergradOnePlusOne(Algorithm): "biglognormal", "hugelognormal", ] = "gaussian" + """Type of mutation to apply. + + 'gaussian' is the default. Other options include 'cauchy', 'discrete', 'fastga', + 'rls', and 'portfolio'. + + """ + annealing: ( Literal[ "none", "Exp0.9", "Exp0.99", "Exp0.9Auto", "Lin100.0", "Lin1.0", "LinAuto" ] | None ) = None + """Annealing schedule for mutation amplitude. + + Can be 'none', exponential (e.g., 'Exp0.9'), or linear (e.g., 'Lin100.0'). 
+ + """ + sparse: bool = False + """Whether to apply random mutations that set variables to zero.""" + super_radii: bool = False + """Whether to apply extended radii beyond standard bounds for candidate generation, + enabling broader exploration.""" + smoother: bool = False + """Whether to suggest smooth mutations.""" + roulette_size: PositiveInt = 64 + """Size of the roulette wheel used for selection, affecting sampling diversity from + past candidates.""" + antismooth: NonNegativeInt = 4 + """Degree of anti-smoothing to prevent premature convergence by penalizing overly + smooth improvements.""" + crossover: bool = False + """Whether to include a genetic crossover step every other iteration.""" + crossover_type: ( Literal["none", "rand", "max", "min", "onepoint", "twopoint"] | None ) = None + """Method for genetic crossover. + + Options include 'rand', 'onepoint', and 'twopoint'. + + """ + tabu_length: NonNegativeInt = 1000 + """Length of the tabu list to prevent revisiting recent candidates and help escape + local minima.""" + rotation: bool = False + """Whether to apply rotational transformations to the search space to enhance search + performance.""" + seed: int | None = None + """Seed for the random number generator for reproducibility.""" + stopping_maxfun: PositiveInt = STOPPING_MAXFUN_GLOBAL + """Maximum number of function evaluations.""" + n_cores: PositiveInt = 1 + """Number of cores to use for parallel computation.""" + sigma: float | None = None + """Standard deviation for sampling initial population from N(0, σ²)if bounds are not + provided.""" def _solve_internal_problem( self, problem: InternalOptimizationProblem, x0: NDArray[np.float64] @@ -336,13 +589,32 @@ def _solve_internal_problem( ) @dataclass(frozen=True) class NevergradDifferentialEvolution(Algorithm): + """Minimize a scalar function using the Differential Evolution optimizer. + + Differential Evolution is typically used for continuous optimization. It uses + differences between points in the population for performing mutations in fruitful + directions. It is a kind of covariance adaptation without any explicit covariance, + making it very fast in high dimensions. + + """ + initialization: Literal["parametrization", "LHS", "QR", "QO", "SO"] = ( "parametrization" ) + """Algorithm for initialization. + + 'LHS' is Latin Hypercube Sampling, 'QR' is Quasi-Random. + + """ + scale: float | str = 1.0 + """Scale of random component of updates.""" + recommendation: Literal["pessimistic", "optimistic", "mean", "noisy"] = ( "pessimistic" ) + """Criterion for selecting the best point to recommend.""" + crossover: ( float | Literal[ @@ -354,14 +626,41 @@ class NevergradDifferentialEvolution(Algorithm): "parametrization", ] ) = 0.5 + """Crossover rate or strategy. + + Can be a float, 'dimension' (1/dim), 'random', 'onepoint', or 'twopoints'. + + """ + F1: PositiveFloat = 0.8 + """Differential weight #1 (scaling factor).""" + F2: PositiveFloat = 0.8 + """Differential weight #2 (scaling factor).""" + population_size: int | Literal["standard", "dimension", "large"] = "standard" + """Population size. + + Can be an integer or a string like 'standard', 'dimension', or 'large' to set it + automatically. 
+ + """ + high_speed: bool = False + """If True, uses a metamodel for recommendations to speed up optimization.""" + stopping_maxfun: PositiveInt = STOPPING_MAXFUN_GLOBAL + """Maximum number of function evaluations before termination.""" + n_cores: PositiveInt = 1 + """Number of cores to use for parallel function evaluation.""" + seed: int | None = None + """Seed for the random number generator for reproducibility.""" + sigma: float | None = None + """Standard deviation for sampling initial population from N(0, σ²)if bounds are not + provided.""" def _solve_internal_problem( self, problem: InternalOptimizationProblem, x0: NDArray[np.float64] @@ -371,7 +670,10 @@ def _solve_internal_problem( import nevergrad as ng + # The nevergrad implementation has `popsize` but we use `population_size` + # for consistency. configured_optimizer = ng.optimizers.DifferentialEvolution( + initialization=self.initialization, scale=self.scale, recommendation=self.recommendation, crossover=self.crossover, @@ -397,7 +699,7 @@ def _solve_internal_problem( @mark.minimizer( name="nevergrad_bo", solver_type=AggregationLevel.SCALAR, - is_available=IS_NEVERGRAD_INSTALLED, + is_available=IS_NEVERGRAD_INSTALLED and IS_BAYESOPTIM_INSTALLED, is_global=True, needs_jac=False, needs_hess=False, @@ -411,14 +713,43 @@ def _solve_internal_problem( ) @dataclass(frozen=True) class NevergradBayesOptim(Algorithm): + """Minimize a scalar function using the Bayesian Optimization (BO) algorithm. + + This wrapper uses the BO and PCA-BO algorithms from the `bayes_optim` package + :cite:`bayesoptimimpl`. PCA-BO (Principal Component Analysis for Bayesian + Optimization) is a dimensionality reduction technique for black-box + optimization. It applies PCA to the input space before performing Bayesian + optimization, improving efficiency in high dimensions by focusing on + directions of greatest variance. + + """ + init_budget: int | None = None + """Number of initialization algorithm steps.""" + pca: bool = False + """Whether to use the PCA transformation, defining PCA-BO rather than standard + BO.""" + n_components: NonNegativeFloat = 0.95 + """Number of principal axes, representing the percentage of explained variance + (e.g., 0.95 means 95% variance retained).""" + prop_doe_factor: NonNegativeFloat | None = 1 + """Percentage of the initial budget used for Design of Experiments (DoE).""" + stopping_maxfun: PositiveInt = STOPPING_MAXFUN_GLOBAL + """Maximum number of function evaluations before termination.""" + n_cores: PositiveInt = 1 + """Number of cores to use for parallel function evaluation.""" + seed: int | None = None + """Seed for the random number generator for reproducibility.""" + sigma: int | None = None + """Standard deviation for sampling initial population from N(0, σ²)in case bounds + are not provided.""" def _solve_internal_problem( self, problem: InternalOptimizationProblem, x0: NDArray[np.float64] @@ -465,14 +796,54 @@ def _solve_internal_problem( ) @dataclass(frozen=True) class NevergradEMNA(Algorithm): + """Minimize a scalar function using the Estimation of Multivariate Normal Algorithm. + + EMNA is a distribution-based evolutionary algorithm that models the search + space using a multivariate Gaussian. It learns the full covariance matrix, + resulting in a cubic time complexity with respect to each sampling. It is + efficient in parallel settings but other methods should be considered first. + See :cite:`emnaimpl`. + + """ + isotropic: bool = True + """If True, uses an isotropic (identity covariance) Gaussian. 
+ + If False, uses a separable (diagonal covariance) Gaussian. + + """ + noise_handling: bool = True + """If True, returns the best individual found. + + If False (recommended for noisy problems), returns the average of the final + population. + + """ + population_size_adaptation: bool = False + """If True, the population size is adjusted automatically based on the optimization + landscape and noise level.""" + initial_popsize: int | None = None + """Initial population size. + + Defaults to 4 times the problem dimension. + + """ + stopping_maxfun: PositiveInt = STOPPING_MAXFUN_GLOBAL + """Maximum number of function evaluations before termination.""" + n_cores: PositiveInt = 1 + """Number of cores to use for parallel function evaluation.""" + seed: int | None = None + """Seed for the random number generator for reproducibility.""" + sigma: float | None = None + """Standard deviation for sampling initial population from N(0, σ²)in case bounds + are not provided.""" def _solve_internal_problem( self, problem: InternalOptimizationProblem, x0: NDArray[np.float64] @@ -482,6 +853,8 @@ def _solve_internal_problem( import nevergrad as ng + # The nevergrad implementation has `naive` but we use `noise_handling` + # for clarity. naive=True -> returns best point; naive=False -> returns mean. configured_optimizer = ng.optimizers.EMNA( isotropic=self.isotropic, naive=self.noise_handling, @@ -519,10 +892,27 @@ def _solve_internal_problem( ) @dataclass(frozen=True) class NevergradCGA(Algorithm): + """Minimize a scalar function using the Compact Genetic Algorithm. + + The Compact Genetic Algorithm (cGA) is a memory-efficient genetic algorithm + that represents the population as a probability vector over gene values. It + simulates the behavior of a simple GA with uniform crossover by updating + probabilities instead of maintaining an explicit population. See :cite:`cgaimpl`. + + """ + stopping_maxfun: PositiveInt = STOPPING_MAXFUN_GLOBAL + """Maximum number of function evaluations before termination.""" + n_cores: PositiveInt = 1 + """Number of cores to use for parallel function evaluation.""" + seed: int | None = None + """Seed for the random number generator for reproducibility.""" + sigma: float | None = None + """Standard deviation for sampling initial population from N(0, σ²)in case bounds + are not provided.""" def _solve_internal_problem( self, problem: InternalOptimizationProblem, x0: NDArray[np.float64] @@ -564,10 +954,28 @@ def _solve_internal_problem( ) @dataclass(frozen=True) class NevergradEDA(Algorithm): + """Minimize a scalar function using the Estimation of Distribution Algorithm. + + Estimation of Distribution Algorithms (EDAs) optimize by building and sampling + a probabilistic model of promising solutions. Instead of using traditional + variation operators like crossover or mutation, EDAs update a distribution + based on selected individuals and sample new candidates from it. + Refer to :cite:`edaimpl`. 
+ + """ + stopping_maxfun: PositiveInt = STOPPING_MAXFUN_GLOBAL + """Maximum number of function evaluations before termination.""" + n_cores: PositiveInt = 1 + """Number of cores to use for parallel function evaluation.""" + seed: int | None = None + """Seed for the random number generator for reproducibility.""" + sigma: float | None = None + """Standard deviation for sampling initial population from N(0, σ²)in case bounds + are not provided.""" def _solve_internal_problem( self, problem: InternalOptimizationProblem, x0: NDArray[np.float64] @@ -609,12 +1017,43 @@ def _solve_internal_problem( ) @dataclass(frozen=True) class NevergradTBPSA(Algorithm): + """Minimize a scalar function using the Test-based Population Size Adaptation + algorithm. + + TBPSA adapts population size based on fitness trend detection using linear + regression. If no significant improvement is found (via hypothesis testing), + the population size is increased to improve robustness, making it effective + for noisy optimization problems. For more details, refer to :cite:`tbpsaimpl`. + + """ + noise_handling: bool = True + """If True, returns the best individual. + + If False (recommended for noisy problems), returns the average of the final + population to reduce noise. + + """ + initial_popsize: int | None = None + """Initial population size. + + If not specified, defaults to 4 times the problem dimension. + + """ + stopping_maxfun: PositiveInt = STOPPING_MAXFUN_GLOBAL + """Maximum number of function evaluations before termination.""" + n_cores: PositiveInt = 1 + """Number of cores to use for parallel function evaluation.""" + seed: int | None = None + """Seed for the random number generator for reproducibility.""" + sigma: float | None = None + """Standard deviation for sampling initial population from N(0, σ²)in case bounds + are not provided.""" def _solve_internal_problem( self, problem: InternalOptimizationProblem, x0: NDArray[np.float64] @@ -624,6 +1063,8 @@ def _solve_internal_problem( import nevergrad as ng + # The nevergrad implementation has `naive` but we use `noise_handling` + # for clarity. naive=True -> returns best point; naive=False -> returns mean. configured_optimizer = ng.optimizers.ParametrizedTBPSA( naive=self.noise_handling, initial_popsize=self.initial_popsize, @@ -659,16 +1100,52 @@ def _solve_internal_problem( ) @dataclass(frozen=True) class NevergradRandomSearch(Algorithm): + """Minimize a scalar function using the Random Search algorithm. + + This is a one-shot optimization method that provides random suggestions and serves + as a simple baseline for other optimizers. + + """ + middle_point: bool = False + """Enforces that the first suggested point is the zero vector.""" + opposition_mode: Literal["opposite", "quasi"] | None = None + """Symmetrizes exploration with respect to the center. + + 'opposite' enables full symmetry, while 'quasi' applies randomized symmetry. + + """ + sampler: Literal["parametrization", "gaussian", "cauchy"] = "parametrization" + """The probability distribution for sampling points. + + 'gaussian' and 'cauchy' are available alternatives. + + """ + scale: PositiveFloat | Literal["random", "auto", "autotune"] = "auto" + """Scalar used to multiply suggested point values. + + Can be a float or a string for auto-scaling ('random', 'auto', 'autotune'). 
+ + """ + recommendation_rule: Literal[ "average_of_best", "pessimistic", "average_of_exp_best" ] = "pessimistic" + """Specifies how the final recommendation is chosen, e.g., 'pessimistic' (default) + or 'average_of_best'.""" + stopping_maxfun: PositiveInt = STOPPING_MAXFUN_GLOBAL + """Maximum number of function evaluations before termination.""" + n_cores: PositiveInt = 1 + """Number of cores to use for parallel function evaluation.""" + sigma: float | None = None + """Standard deviation for sampling initial population from N(0, σ²)in case bounds + are not provided.""" def _solve_internal_problem( self, problem: InternalOptimizationProblem, x0: NDArray[np.float64] @@ -717,17 +1194,60 @@ def _solve_internal_problem( ) @dataclass(frozen=True) class NevergradSamplingSearch(Algorithm): + """Minimize a scalar function using SamplingSearch. + + This is a one-shot optimization method that is better than random search because it + uses low-discrepancy sequences to ensure more uniform coverage of the search space. + It is recommended to use "Hammersley" as the sampler if the budget is known, and to + set `scrambled=True` in high dimensions. + + """ + sampler: Literal["Halton", "LHS", "Hammersley"] = "Halton" + """Choice of the low-discrepancy sampler used for generating points. + + 'LHS' is Latin Hypercube Sampling. + + """ + scrambled: bool = False + """If True, adds scrambling to the search sequence, which is highly recommended for + high-dimensional problems.""" + middle_point: bool = False + """If True, the first suggested point is the zero vector, useful for initializing at + the center of the search space.""" + cauchy: bool = False + """If True, uses the inverse Cauchy distribution instead of Gaussian when projecting + samples to a real-valued space.""" + scale: bool | NonNegativeFloat = 1.0 + """A float multiplier to scale all generated points.""" + rescaled: bool = False + """If True, rescales the sampling pattern to ensure better coverage of the + boundaries.""" + recommendation_rule: Literal["average_of_best", "pessimistic"] = "pessimistic" + """How the final recommendation is chosen. + + 'pessimistic' is the default. + + """ + stopping_maxfun: PositiveInt = STOPPING_MAXFUN_GLOBAL + """Maximum number of function evaluations before termination.""" + n_cores: PositiveInt = 1 + """Number of cores to use for parallel function evaluation.""" + seed: int | None = None + """Seed for the random number generator for reproducibility.""" + sigma: float | None = None + """Standard deviation for sampling initial population from N(0, σ²)in case bounds + are not provided.""" def _solve_internal_problem( self, problem: InternalOptimizationProblem, x0: NDArray[np.float64] @@ -753,7 +1273,7 @@ def _solve_internal_problem( configured_optimizer=configured_optimizer, stopping_maxfun=self.stopping_maxfun, n_cores=self.n_cores, - seed=None, + seed=self.seed, sigma=self.sigma, nonlinear_constraints=problem.nonlinear_constraints, ) @@ -777,6 +1297,14 @@ def _solve_internal_problem( ) @dataclass(frozen=True) class NevergradNGOpt(Algorithm): + """Minimize a scalar function using a Meta Optimizer from Nevergrad. + + These are meta-optimizers that intelligently combine multiple different + optimization algorithms to solve a problem. The specific portfolio of + optimizers can be selected via the `optimizer` parameter. + + """ + optimizer: Literal[ "NGOpt", "NGOpt4", @@ -831,10 +1359,24 @@ class NevergradNGOpt(Algorithm): "CSEC11", "Wiz", ] = "NGOpt" + """The specific Nevergrad meta-optimizer to use. 
+ + Each option is a portfolio of different algorithms. + + """ + stopping_maxfun: PositiveInt = STOPPING_MAXFUN_GLOBAL + """Maximum number of function evaluations before termination.""" + n_cores: PositiveInt = 1 + """Number of cores to use for parallel function evaluation.""" + seed: int | None = None + """Seed for the random number generator for reproducibility.""" + sigma: float | None = None + """Standard deviation for sampling initial population from N(0, σ²)in case bounds + are not provided.""" def _solve_internal_problem( self, problem: InternalOptimizationProblem, x0: NDArray[np.float64] @@ -877,6 +1419,14 @@ def _solve_internal_problem( ) @dataclass(frozen=True) class NevergradMeta(Algorithm): + """Minimize a scalar function using a Meta Optimizer from Nevergrad. + + This algorithm utilizes a combination of local and global optimizers to find + the best solution. The specific portfolio of optimizers can be selected via + the `optimizer` parameter. + + """ + optimizer: Literal[ "MultiBFGSPlus", "LogMultiBFGSPlus", @@ -916,10 +1466,24 @@ class NevergradMeta(Algorithm): "Shiwa", "Carola3", ] = "Shiwa" + """The specific Nevergrad meta-optimizer to use. + + Each option is a portfolio of different local and global algorithms. + + """ + stopping_maxfun: PositiveInt = STOPPING_MAXFUN_GLOBAL + """Maximum number of function evaluations before termination.""" + n_cores: PositiveInt = 1 + """Number of cores to use for parallel function evaluation.""" + seed: int | None = None + """Seed for the random number generator for reproducibility.""" + sigma: float | None = None + """Standard deviation for sampling initial population from N(0, σ²) in case bounds + are not provided.""" def _solve_internal_problem( self, problem: InternalOptimizationProblem, x0: NDArray[np.float64] @@ -949,7 +1513,7 @@ def _nevergrad_internal( problem: InternalOptimizationProblem, x0: NDArray[np.float64], n_cores: int, - configured_optimizer: "ng.optimization.base.ConfiguredOptimizer", + configured_optimizer: ConfiguredOptimizer, stopping_maxfun: int, seed: int | None, sigma: float | None,
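
The options documented in these docstrings are ordinary dataclass fields, so a configured optimizer instance can be passed straight to `om.minimize`. Below is a minimal usage sketch, not part of the diff: the sphere objective, bounds, and option values are illustrative and assume `nevergrad` is installed; only the `NevergradTBPSA` class and its `noise_handling`, `stopping_maxfun`, and `seed` fields come from the code above.

```python
import numpy as np
import optimagic as om
from optimagic.optimizers.nevergrad_optimizers import NevergradTBPSA


def sphere(x):
    # Toy objective with its minimum at the zero vector.
    return np.sum(x**2)


# noise_handling=False returns the mean of the final population (the docstring
# above recommends this for noisy objectives); seed makes the run reproducible.
algo = NevergradTBPSA(noise_handling=False, stopping_maxfun=2_000, seed=0)

res = om.minimize(
    fun=sphere,
    params=np.full(5, 2.0),
    bounds=om.Bounds(lower=np.full(5, -5.0), upper=np.full(5, 5.0)),
    algorithm=algo,
)
print(res.params)
```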