diff --git a/docs/source/contributing/developer_guide.md b/docs/source/contributing/developer_guide.md
index 7820b1f438..b051b8fd2f 100644
--- a/docs/source/contributing/developer_guide.md
+++ b/docs/source/contributing/developer_guide.md
@@ -245,7 +245,7 @@ usually created in order to optimise performance. But getting a
 possible (see also in {ref}`doc `):
 
-.. code:: python
+.. code-block:: python
 
     lognorm = Exp().apply(pm.Normal.dist(0., 1.))
 
@@ -262,7 +262,7 @@ Now, back to ``model.RV(...)`` - things returned from ``model.RV(...)``
 are PyTensor tensor variables, and it is clear from looking at
 ``TransformedRV``:
 
-.. code:: python
+.. code-block:: python
 
     class TransformedRV(TensorVariable):
         ...
@@ -270,7 +270,7 @@ are PyTensor tensor variables, and it is clear from looking at
 as for ``FreeRV`` and ``ObservedRV``, they are ``TensorVariable``\s with
 ``Factor`` as mixin:
 
-.. code:: python
+.. code-block:: python
 
     class FreeRV(Factor, TensorVariable):
         ...
@@ -283,7 +283,7 @@ distribution into a ``TransformedDistribution``, and then ``model.Var`` is
 called again to added the RV associated with the ``TransformedDistribution``
 as a ``FreeRV``:
 
-.. code:: python
+.. code-block:: python
 
     ...
     self.transformed = model.Var(
@@ -295,13 +295,13 @@ only add one ``FreeRV``. In another word, you *cannot* do chain transformation
 by nested applying multiple transforms to a Distribution (however, you can use
 ``Chain`` transformation.
 
-.. code:: python
+.. code-block:: python
 
     z = pm.LogNormal.dist(mu=0., sigma=1., transform=tr.Log)
     z.transform  # ==> pymc.distributions.transforms.Log
 
-.. code:: python
+.. code-block:: python
 
     z2 = Exp().apply(z)
     z2.transform is None  # ==> True
diff --git a/pymc/distributions/discrete.py b/pymc/distributions/discrete.py
index b1d188d461..7de2b72e23 100644
--- a/pymc/distributions/discrete.py
+++ b/pymc/distributions/discrete.py
@@ -1357,7 +1357,7 @@ class OrderedProbit:
 
     Examples
     --------
-    .. code:: python
+    .. code-block:: python
 
         # Generate data for a simple 1 dimensional example problem
        n1_c = 300; n2_c = 300; n3_c = 300
diff --git a/pymc/distributions/multivariate.py b/pymc/distributions/multivariate.py
index 4415f5ec25..f76a98546e 100644
--- a/pymc/distributions/multivariate.py
+++ b/pymc/distributions/multivariate.py
@@ -1390,7 +1390,7 @@ class LKJCholeskyCov:
 
     Examples
     --------
-    .. code:: python
+    .. code-block:: python
 
         with pm.Model() as model:
             # Note that we access the distribution for the standard
@@ -1682,28 +1682,25 @@ class LKJCorr:
 
     Examples
     --------
-    .. code:: python
+    .. code-block:: python
 
         with pm.Model() as model:
-            # Define the vector of fixed standard deviations
-            sds = 3*np.ones(10)
+            sds = 3 * np.ones(10)
 
-            corr = pm.LKJCorr(
-                'corr', eta=4, n=10, return_matrix=True
-            )
+            corr = pm.LKJCorr("corr", eta=4, n=10, return_matrix=True)
 
             # Define a new MvNormal with the given correlation matrix
-            vals = sds*pm.MvNormal('vals', mu=np.zeros(10), cov=corr, shape=10)
+            vals = sds * pm.MvNormal("vals", mu=np.zeros(10), cov=corr, shape=10)
 
             # Or transform an uncorrelated normal distribution:
-            vals_raw = pm.Normal('vals_raw', shape=10)
+            vals_raw = pm.Normal("vals_raw", shape=10)
             chol = pt.linalg.cholesky(corr)
-            vals = sds*pt.dot(chol,vals_raw)
+            vals = sds * pt.dot(chol, vals_raw)
 
             # The matrix is internally still sampled as a upper triangular vector
             # If you want access to it in matrix form in the trace, add
-            pm.Deterministic('corr_mat', corr)
+            pm.Deterministic("corr_mat", corr)
 
 
     References
@@ -1797,7 +1794,7 @@ class MatrixNormal(Continuous):
     Define a matrixvariate normal variable for given row and column covariance
     matrices.
 
-    .. code:: python
+    .. code-block:: python
 
         import pymc as pm
         import numpy as np
@@ -1820,16 +1817,20 @@ class MatrixNormal(Continuous):
     constant, both the covariance and scaling could be learned as follows
     (see the docstring of `LKJCholeskyCov` for more information about this)
 
-    .. code:: python
+    .. code-block:: python
 
         # Setup data
-        true_colcov = np.array([[1.0, 0.5, 0.1],
-                                [0.5, 1.0, 0.2],
-                                [0.1, 0.2, 1.0]])
+        true_colcov = np.array(
+            [
+                [1.0, 0.5, 0.1],
+                [0.5, 1.0, 0.2],
+                [0.1, 0.2, 1.0],
+            ]
+        )
         m = 3
         n = true_colcov.shape[0]
         true_scale = 3
-        true_rowcov = np.diag([true_scale**(2*i) for i in range(m)])
+        true_rowcov = np.diag([true_scale ** (2 * i) for i in range(m)])
         mu = np.zeros((m, n))
         true_kron = np.kron(true_rowcov, true_colcov)
         data = np.random.multivariate_normal(mu.flatten(), true_kron)
@@ -1838,13 +1839,12 @@ class MatrixNormal(Continuous):
         with pm.Model() as model:
             # Setup right cholesky matrix
             sd_dist = pm.HalfCauchy.dist(beta=2.5, shape=3)
-            colchol,_,_ = pm.LKJCholeskyCov('colchol', n=3, eta=2,sd_dist=sd_dist)
+            colchol, _, _ = pm.LKJCholeskyCov("colchol", n=3, eta=2, sd_dist=sd_dist)
             # Setup left covariance matrix
-            scale = pm.LogNormal('scale', mu=np.log(true_scale), sigma=0.5)
-            rowcov = pt.diag([scale**(2*i) for i in range(m)])
+            scale = pm.LogNormal("scale", mu=np.log(true_scale), sigma=0.5)
+            rowcov = pt.diag([scale ** (2 * i) for i in range(m)])
 
-            vals = pm.MatrixNormal('vals', mu=mu, colchol=colchol, rowcov=rowcov,
-                                   observed=data)
+            vals = pm.MatrixNormal("vals", mu=mu, colchol=colchol, rowcov=rowcov, observed=data)
     """
 
     rv_op = matrixnormal
@@ -2010,15 +2010,15 @@ class KroneckerNormal(Continuous):
     Define a multivariate normal variable with a covariance
     :math:`K = K_1 \otimes K_2`
 
-    .. code:: python
+    .. code-block:: python
 
-        K1 = np.array([[1., 0.5], [0.5, 2]])
-        K2 = np.array([[1., 0.4, 0.2], [0.4, 2, 0.3], [0.2, 0.3, 1]])
+        K1 = np.array([[1.0, 0.5], [0.5, 2]])
+        K2 = np.array([[1.0, 0.4, 0.2], [0.4, 2, 0.3], [0.2, 0.3, 1]])
         covs = [K1, K2]
         N = 6
         mu = np.zeros(N)
         with pm.Model() as model:
-            vals = pm.KroneckerNormal('vals', mu=mu, covs=covs, shape=N)
+            vals = pm.KroneckerNormal("vals", mu=mu, covs=covs, shape=N)
 
     Efficiency gains are made by cholesky decomposing :math:`K_1` and
     :math:`K_2` individually rather than the larger :math:`K` matrix. Although
@@ -2026,14 +2026,14 @@ class KroneckerNormal(Continuous):
     number of submatrices can be combined in this way. Choleskys and
     eigendecompositions can be provided instead
 
-    .. code:: python
+    .. code-block:: python
 
         chols = [np.linalg.cholesky(Ki) for Ki in covs]
        evds = [np.linalg.eigh(Ki) for Ki in covs]
        with pm.Model() as model:
-            vals2 = pm.KroneckerNormal('vals2', mu=mu, chols=chols, shape=N)
+            vals2 = pm.KroneckerNormal("vals2", mu=mu, chols=chols, shape=N)
             # or
-            vals3 = pm.KroneckerNormal('vals3', mu=mu, evds=evds, shape=N)
+            vals3 = pm.KroneckerNormal("vals3", mu=mu, evds=evds, shape=N)
 
     neither of which will be converted. Diagonal noise can also be added to
     the covariance matrix, :math:`K = K_1 \otimes K_2 + \sigma^2 I_N`.
@@ -2042,13 +2042,13 @@ class KroneckerNormal(Continuous):
     utilizing eigendecompositons of the submatrices behind the scenes [1].
     Thus,
 
-    .. code:: python
+    .. code-block:: python
 
         sigma = 0.1
         with pm.Model() as noise_model:
-            vals = pm.KroneckerNormal('vals', mu=mu, covs=covs, sigma=sigma, shape=N)
-            vals2 = pm.KroneckerNormal('vals2', mu=mu, chols=chols, sigma=sigma, shape=N)
-            vals3 = pm.KroneckerNormal('vals3', mu=mu, evds=evds, sigma=sigma, shape=N)
+            vals = pm.KroneckerNormal("vals", mu=mu, covs=covs, sigma=sigma, shape=N)
+            vals2 = pm.KroneckerNormal("vals2", mu=mu, chols=chols, sigma=sigma, shape=N)
+            vals3 = pm.KroneckerNormal("vals3", mu=mu, evds=evds, sigma=sigma, shape=N)
 
     are identical, with `covs` and `chols` each converted to
     eigendecompositions.
diff --git a/pymc/gp/cov.py b/pymc/gp/cov.py
index a029e1e403..0df212aea1 100644
--- a/pymc/gp/cov.py
+++ b/pymc/gp/cov.py
@@ -956,7 +956,7 @@ class WrappedPeriodic(Covariance):
     In order to construct a kernel equivalent to the `Periodic` kernel you
     can do the following (though using `Periodic` will likely be a bit faster):
 
-    .. code:: python
+    .. code-block:: python
 
         exp_quad = pm.gp.cov.ExpQuad(1, ls=0.5)
         cov = pm.gp.cov.WrappedPeriodic(exp_quad, period=5)
diff --git a/pymc/gp/gp.py b/pymc/gp/gp.py
index ee198d1e2e..45f45b3e3a 100644
--- a/pymc/gp/gp.py
+++ b/pymc/gp/gp.py
@@ -117,7 +117,7 @@ class Latent(Base):
 
     Examples
     --------
-    .. code:: python
+    .. code-block:: python
 
         # A one dimensional column vector of inputs.
         X = np.linspace(0, 1, 10)[:, None]
@@ -439,7 +439,7 @@ class Marginal(Base):
 
     Examples
     --------
-    .. code:: python
+    .. code-block:: python
 
         # A one dimensional column vector of inputs.
         X = np.linspace(0, 1, 10)[:, None]
@@ -719,7 +719,7 @@ class MarginalApprox(Marginal):
 
     Examples
     --------
-    .. code:: python
+    .. code-block:: python
 
         # A one dimensional column vector of inputs.
         X = np.linspace(0, 1, 10)[:, None]
@@ -963,7 +963,7 @@ class LatentKron(Base):
 
     Examples
     --------
-    .. code:: python
+    .. code-block:: python
 
         # One dimensional column vectors of inputs
        X1 = np.linspace(0, 1, 10)[:, None]
@@ -1066,7 +1066,7 @@ def conditional(self, name, Xnew, jitter=JITTER_DEFAULT, **kwargs):
         1, and 4, respectively, then `Xnew` must have 7 columns and a
         covariance between the prediction points
 
-        .. code:: python
+        .. code-block:: python
 
             cov_func(Xnew) = cov_func1(Xnew[:, :2]) * cov_func1(Xnew[:, 2:3]) * cov_func1(Xnew[:, 3:])
 
@@ -1118,13 +1118,13 @@ class MarginalKron(Base):
 
     Examples
     --------
-    .. code:: python
+    .. code-block:: python
 
         # One dimensional column vectors of inputs
        X1 = np.linspace(0, 1, 10)[:, None]
        X2 = np.linspace(0, 2, 5)[:, None]
        Xs = [X1, X2]
-        y = np.random.randn(len(X1)*len(X2))  # toy data
+        y = np.random.randn(len(X1) * len(X2))  # toy data
        with pm.Model() as model:
            # Specify the covariance functions for each Xi
            cov_func1 = pm.gp.cov.ExpQuad(1, ls=0.1)  # Must accept X1 without error
@@ -1266,7 +1266,7 @@ def conditional(self, name, Xnew, pred_noise=False, diag=False, **kwargs):
         1, and 4, respectively, then `Xnew` must have 7 columns and a
         covariance between the prediction points
 
-        .. code:: python
+        .. code-block:: python
 
             cov_func(Xnew) = cov_func1(Xnew[:, :2]) * cov_func1(Xnew[:, 2:3]) * cov_func1(Xnew[:, 3:])
 
diff --git a/pymc/gp/hsgp_approx.py b/pymc/gp/hsgp_approx.py
index 9bad17ce87..cf889aadd1 100644
--- a/pymc/gp/hsgp_approx.py
+++ b/pymc/gp/hsgp_approx.py
@@ -215,7 +215,7 @@ class HSGP(Base):
 
     Examples
     --------
-    .. code:: python
+    .. code-block:: python
 
         # A three dimensional column vector of inputs.
         X = np.random.rand(100, 3)
@@ -357,7 +357,7 @@ def prior_linearized(self, X: TensorLike):
 
         Examples
         --------
-        .. code:: python
+        .. code-block:: python
 
             # A one dimensional column vector of inputs.
             X = np.linspace(0, 10, 100)[:, None]
@@ -544,7 +544,7 @@ class HSGPPeriodic(Base):
 
     Examples
     --------
-    .. code:: python
+    .. code-block:: python
 
         # A three dimensional column vector of inputs.
         X = np.random.rand(100, 3)
@@ -640,7 +640,7 @@ def prior_linearized(self, X: TensorLike):
 
         Examples
        --------
-        .. code:: python
+        .. code-block:: python
 
            # A one dimensional column vector of inputs.
            X = np.linspace(0, 10, 100)[:, None]
@@ -665,8 +665,7 @@ def prior_linearized(self, X: TensorLike):
 
            # The (non-centered) GP approximation is given by
            f = pm.Deterministic(
-                "f",
-                phi_cos @ (psd * beta[:m]) + phi_sin[..., 1:] @ (psd[1:] * beta[m:])
+                "f", phi_cos @ (psd * beta[:m]) + phi_sin[..., 1:] @ (psd[1:] * beta[m:])
            )
            ...
 
diff --git a/pymc/model/core.py b/pymc/model/core.py
index 5ec5c0ec34..26c5f15184 100644
--- a/pymc/model/core.py
+++ b/pymc/model/core.py
@@ -2204,7 +2204,7 @@ def Deterministic(name, var, model=None, dims=None):
     Indeed, PyMC allows for arbitrary combinations of random variables, for
     example in the case of a logistic regression
 
-    .. code:: python
+    .. code-block:: python
 
         with pm.Model():
             alpha = pm.Normal("alpha", 0, 1)
@@ -2218,7 +2218,7 @@ def Deterministic(name, var, model=None, dims=None):
     ``p`` is important and one would like to track its value in the sampling
     trace, then one can use a deterministic node:
 
-    .. code:: python
+    .. code-block:: python
 
         with pm.Model():
             alpha = pm.Normal("alpha", 0, 1)
@@ -2294,7 +2294,7 @@ def Potential(name, var: TensorVariable, model=None, dims=None) -> TensorVariabl
     The statement ``pm.math.log(pm.math.switch(constraint, 0, 1))`` adds either 0 or -inf to the model logp,
     depending on whether the constraint is met. During sampling, any proposals where ``x`` is negative will be rejected.
 
-    .. code:: python
+    .. code-block:: python
 
         import pymc as pm
 
@@ -2308,7 +2308,7 @@ def Potential(name, var: TensorVariable, model=None, dims=None) -> TensorVariabl
     Instead, with a soft constraint like ``pm.math.log(pm.math.switch(constraint, 1, 0.5))``,
     the sampler will be less likely, but not forbidden, from accepting negative values for `x`.
 
-    .. code:: python
+    .. code-block:: python
 
         import pymc as pm
 
@@ -2316,33 +2316,35 @@ def Potential(name, var: TensorVariable, model=None, dims=None) -> TensorVariabl
             x = pm.Normal("x", mu=0, sigma=1)
 
             constraint = x >= 0
-            potential = pm.Potential("x_constraint", pm.math.log(pm.math.switch(constraint, 1.0, 0.5)))
+            potential = pm.Potential(
+                "x_constraint", pm.math.log(pm.math.switch(constraint, 1.0, 0.5))
+            )
 
     A Potential term can depend on multiple variables.
     In the following example, the ``soft_sum_constraint`` potential encourages ``x`` and ``y`` to have a small sum.
     The more the sum deviates from zero, the more negative the penalty value of ``(-((x + y)**2))``.
 
-    .. code:: python
+    .. code-block:: python
 
         import pymc as pm
 
         with pm.Model() as model:
             x = pm.Normal("x", mu=0, sigma=10)
             y = pm.Normal("y", mu=0, sigma=10)
-            soft_sum_constraint = pm.Potential("soft_sum_constraint", -((x + y)**2))
+            soft_sum_constraint = pm.Potential("soft_sum_constraint", -((x + y) ** 2))
 
     A Potential can be used to define a specific prior term.
    The following example imposes a power law prior on `max_items`,
    under the form ``log(1/max_items)``, which penalizes very large values of `max_items`.
 
-    .. code:: python
+    .. code-block:: python
 
         import pymc as pm
 
         with pm.Model() as model:
            # p(max_items) = 1 / max_items
            max_items = pm.Uniform("max_items", lower=1, upper=100)
-            pm.Potential("power_prior", pm.math.log(1/max_items))
+            pm.Potential("power_prior", pm.math.log(1 / max_items))
 
            n_items = pm.Uniform("n_items", lower=1, upper=max_items, observed=60)
 
@@ -2350,13 +2352,15 @@ def Potential(name, var: TensorVariable, model=None, dims=None) -> TensorVariabl
     In the following example, a normal likelihood term is added to fixed data.
     The same result would be obtained by using an observed `Normal` variable.
 
-    .. code:: python
+    .. code-block:: python
 
         import pymc as pm
 
+
         def normal_logp(value, mu, sigma):
             return -0.5 * ((value - mu) / sigma) ** 2 - pm.math.log(sigma)
 
+
         with pm.Model() as model:
             mu = pm.Normal("x")
             sigma = pm.HalfNormal("sigma")
diff --git a/pymc/pytensorf.py b/pymc/pytensorf.py
index 3f90a69ff7..804967a2c6 100644
--- a/pymc/pytensorf.py
+++ b/pymc/pytensorf.py
@@ -722,17 +722,19 @@ def collect_default_updates(
 
     Examples
     --------
-    .. code:: python
+    .. code-block:: python
 
         import pymc as pm
 
         from pytensor.scan import scan
         from pymc.pytensorf import collect_default_updates
 
+
         def scan_step(xtm1):
             x = xtm1 + pm.Normal.dist()
             x_update = collect_default_updates([x])
             return x, x_update
 
+
         x0 = pm.Normal.dist()
         xs, updates = scan(
diff --git a/pymc/sampling/deterministic.py b/pymc/sampling/deterministic.py
index 85ce0a1164..ea21273503 100644
--- a/pymc/sampling/deterministic.py
+++ b/pymc/sampling/deterministic.py
@@ -61,7 +61,7 @@ def compute_deterministics(
 
     Examples
     --------
-    .. code:: python
+    .. code-block:: python
 
         import pymc as pm
 
diff --git a/pymc/sampling/forward.py b/pymc/sampling/forward.py
index c53dde90f2..d1703e82a3 100644
--- a/pymc/sampling/forward.py
+++ b/pymc/sampling/forward.py
@@ -603,7 +603,7 @@ def sample_posterior_predictive(
     The most common use of `sample_posterior_predictive` is to perform posterior predictive checks (in-sample predictions)
     and new model predictions (out-of-sample predictions).
 
-    .. code:: python
+    .. code-block:: python
 
         import pymc as pm
 
@@ -632,7 +632,7 @@ def sample_posterior_predictive(
     For the last example we could have created a new predictions model.
     Note that we have to specify `var_names` explicitly, because the newly defined `y` was not given any observations:
 
-    .. code:: python
+    .. code-block:: python
 
         with pm.Model(coords_mutable={"trial": [3, 4]}) as predictions_model:
             x = pm.MutableData("x", [-2, 2], dims=["trial"])
@@ -640,14 +640,16 @@ def sample_posterior_predictive(
             noise = pm.HalfNormal("noise")
             y = pm.Normal("y", mu=x * beta, sigma=noise, dims=["trial"])
 
-        predictions = pm.sample_posterior_predictive(idata, var_names=["y"], predictions=True).predictions
+        predictions = pm.sample_posterior_predictive(
+            idata, var_names=["y"], predictions=True
+        ).predictions
 
     The new model may even have a different structure and unobserved variables that don't exist in the trace.
     These variables will also be forward sampled. In the following example we added a new ``extra_noise``
     variable between the inferred posterior ``noise`` and the new StudentT observational distribution ``y``:
 
-    .. code:: python
+    .. code-block:: python
 
         with pm.Model(coords_mutable={"trial": [3, 4]}) as distinct_predictions_model:
             x = pm.MutableData("x", [-2, 2], dims=["trial"])
@@ -656,7 +658,9 @@ def sample_posterior_predictive(
             extra_noise = pm.HalfNormal("extra_noise", sigma=noise)
             y = pm.StudentT("y", nu=4, mu=x * beta, sigma=extra_noise, dims=["trial"])
 
-        predictions = pm.sample_posterior_predictive(idata, var_names=["y"], predictions=True).predictions
+        predictions = pm.sample_posterior_predictive(
+            idata, var_names=["y"], predictions=True
+        ).predictions
 
     For more about out-of-model predictions, see this `blog post `_.
 
@@ -682,7 +686,7 @@ def sample_posterior_predictive(
 
     The following code block explores how the behavior changes with different `var_names`:
 
-    .. code:: python
+    .. code-block:: python
 
         from logging import getLogger
         import pymc as pm
@@ -705,7 +709,7 @@ def sample_posterior_predictive(
     Default behavior. Generate samples of ``obs``, conditioned on the posterior samples of ``z`` found in the trace.
     These are often referred to as posterior predictive samples in the literature:
 
-    .. code:: python
+    .. code-block:: python
 
         with model:
             pm.sample_posterior_predictive(idata, var_names=["obs"], **kwargs)
@@ -782,7 +786,7 @@ def sample_posterior_predictive(
 
     You can manipulate the InferenceData to control the number of samples
 
-    .. code:: python
+    .. code-block:: python
 
         import pymc as pm
 
@@ -792,7 +796,7 @@ def sample_posterior_predictive(
 
     Generate 1 posterior predictive sample for every 5 posterior samples.
 
-    .. code:: python
+    .. code-block:: python
 
         thinned_idata = idata.sel(draw=slice(None, None, 5))
         with model:
@@ -801,7 +805,7 @@ def sample_posterior_predictive(
 
     Generate 5 posterior predictive samples for every posterior sample.
 
-    .. code:: python
+    .. code-block:: python
 
         expanded_idata = idata.copy()
         expanded_idata.posterior = idata.posterior.expand_dims(pred_id=5)
diff --git a/pymc/sampling/mcmc.py b/pymc/sampling/mcmc.py
index 542797caa8..de341c68cd 100644
--- a/pymc/sampling/mcmc.py
+++ b/pymc/sampling/mcmc.py
@@ -669,7 +669,7 @@ def sample(
 
     Examples
     --------
-    .. code:: ipython
+    .. code-block:: ipython
 
         In [1]: import pymc as pm
            ...: n = 100
diff --git a/pymc/variational/opvi.py b/pymc/variational/opvi.py
index deedfc8d9f..3cd5cc3dcf 100644
--- a/pymc/variational/opvi.py
+++ b/pymc/variational/opvi.py
@@ -628,7 +628,7 @@ class Group(WithMemoization):
     Passing the correct `vfam` (Variational FAMily) argument you'll tell what
     parametrization is desired for the group. This helps not to overload code with lots of classes.
 
-    .. code:: python
+    .. code-block:: python
 
         >>> group = Group([latent1, latent2], vfam="mean_field")
 
@@ -639,7 +639,7 @@ class Group(WithMemoization):
     passing both `vfam` and `params` is prohibited. Partial parametrization is prohibited by
     design to avoid corner cases and possible problems.
 
-    .. code:: python
+    .. code-block:: python
 
         >>> group = Group([latent3], params=dict(mu=my_mu, rho=my_rho))
 
@@ -663,7 +663,7 @@ class Group(WithMemoization):
     correctly initialized. For those groups which have group equal to `None` it will collect all
     the rest variables not covered by other groups and perform delayed init.
 
-    .. code:: python
+    .. code-block:: python
 
         >>> group_1 = Group([latent1], vfam="fr")  # latent1 has full rank approximation
         >>> group_other = Group(None, vfam="mf")  # other variables have mean field Q
@@ -674,7 +674,7 @@ class Group(WithMemoization):
     When you have created all the groups they need to pass all the groups to :class:`Approximation`.
     It does not accept any other parameter rather than `groups`
 
-    .. code:: python
+    .. code-block:: python
 
         >>> approx = Approximation(my_groups)