diff --git a/.gitignore b/.gitignore
index 38a17a91e5..209292ff09 100644
--- a/.gitignore
+++ b/.gitignore
@@ -175,7 +175,7 @@ examples/basic_usage/*.xyz
 extensions/
 
 # sphinx gallery
-docs/src/examples
+docs/src/generated_examples/
 *execution_times*
 
 # JavaScript
diff --git a/.readthedocs.yml b/.readthedocs.yml
index 4d6792f547..7dae109215 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yml
@@ -13,9 +13,7 @@ build:
     rust: "1.75"
   jobs:
     pre_build:
-      - set -e && cd examples/ase && bash train.sh
-      - set -e && cd examples/llpr && bash train.sh
-      - set -e && cd examples/zbl && bash train.sh
+      - set -e && for f in $(find examples -name '*.sh'); do cd $(dirname $f); bash $(basename $f); cd -; done
 
 # Build documentation in the docs/ directory with Sphinx
 sphinx:
diff --git a/docs/generate_examples/conf.py b/docs/generate_examples/conf.py
index ffdccc12f7..39b920e5c7 100644
--- a/docs/generate_examples/conf.py
+++ b/docs/generate_examples/conf.py
@@ -15,34 +15,13 @@
 sphinx_gallery_conf = {
     "filename_pattern": r"/*\.py",
     "copyfile_regex": r".*\.(pt|sh|xyz|yaml)",
-    "ignore_pattern": r"train\.sh",
     "example_extensions": {".py", ".sh"},
     "default_thumb_file": os.path.join(ROOT, "docs/src/logo/metatrain-512.png"),
-    "examples_dirs": [
-        os.path.join(ROOT, "examples", "ase"),
-        os.path.join(ROOT, "examples", "llpr"),
-        os.path.join(ROOT, "examples", "zbl"),
-        os.path.join(ROOT, "examples", "programmatic", "use_architectures_outside"),
-        os.path.join(ROOT, "examples", "programmatic", "data_preparation"),
-        os.path.join(ROOT, "examples", "programmatic", "flashmd"),
-        os.path.join(ROOT, "examples", "basic_usage"),
-        os.path.join(ROOT, "examples", "train_from_scratch"),
-        os.path.join(ROOT, "examples", "validation"),
-        os.path.join(ROOT, "examples", "multi-gpu"),
-    ],
-    "gallery_dirs": [
-        os.path.join(ROOT, "docs", "src", "examples", "ase"),
-        os.path.join(ROOT, "docs", "src", "examples", "llpr"),
-        os.path.join(ROOT, "docs", "src", "examples", "zbl"),
-        os.path.join(ROOT, "docs", "src", "examples", "programmatic", "use_architectures_outside"),
-        os.path.join(ROOT, "docs", "src", "examples", "programmatic", "data_preparation"),
-        os.path.join(ROOT, "docs", "src", "examples", "programmatic", "flashmd"),
-        os.path.join(ROOT, "docs", "src", "examples", "basic_usage"),
-        os.path.join(ROOT, "docs", "src", "examples", "train_from_scratch"),
-        os.path.join(ROOT, "docs", "src", "examples", "validation"),
-        os.path.join(ROOT, "docs", "src", "examples", "multi-gpu"),
-    ],
+    "examples_dirs": "../../examples",
+    "gallery_dirs": "generated_examples",
     "min_reported_time": 5,
     "matplotlib_animations": True,
-    "image_scrapers": ( ChemiscopeScraper(), ),
+    "image_scrapers": ["matplotlib", ChemiscopeScraper()],
+    "remove_config_comments": True,
+    "within_subsection_order": "FileNameSortKey",
 }
diff --git a/docs/src/architectures/nanopet.rst b/docs/src/architectures/nanopet.rst
index 821c93a004..a410ca9f7f 100644
--- a/docs/src/architectures/nanopet.rst
+++ b/docs/src/architectures/nanopet.rst
@@ -59,8 +59,7 @@ hyperparameters to tune are (in decreasing order of importance):
   neural network. Depending on the dataset, increasing this hyperparameter might lead
   to better accuracy, at the cost of increased training and evaluation time.
 - ``loss``: This section describes the loss function to be used. See the
-  :doc:`dedicated documentation page <../advanced-concepts/loss-functions>` for more
-  details.
+  :ref:`loss-functions` for more details.
 - ``long_range``: In some systems and datasets, enabling long-range Coulomb
   interactions might be beneficial for the accuracy of the model and/or its physical
   correctness. See below for a breakdown of the long-range section of the model
   hyperparameters.
diff --git a/docs/src/architectures/pet.rst b/docs/src/architectures/pet.rst
index e3d5a4442a..c64bb40eee 100644
--- a/docs/src/architectures/pet.rst
+++ b/docs/src/architectures/pet.rst
@@ -62,8 +62,7 @@ hyperparameters to tune are (in decreasing order of importance):
   neural network. Depending on the dataset, increasing this hyperparameter might lead
   to better accuracy, at the cost of increased training and evaluation time.
 - ``loss``: This section describes the loss function to be used. See the
-  :doc:`dedicated documentation page <../advanced-concepts/loss-functions>` for more
-  details.
+  :ref:`loss-functions` for more details.
 - ``long_range``: In some systems and datasets, enabling long-range Coulomb
   interactions might be beneficial for the accuracy of the model and/or its physical
   correctness. See below for a breakdown of the long-range section of the model
   hyperparameters.
diff --git a/docs/src/architectures/soap-bpnn.rst b/docs/src/architectures/soap-bpnn.rst
index 5cc62d0266..7ec55e0fe8 100644
--- a/docs/src/architectures/soap-bpnn.rst
+++ b/docs/src/architectures/soap-bpnn.rst
@@ -57,8 +57,7 @@ We explain below the model-specific hypers for SOAP-BPNN.
   this hyperparameter to ``false`` will lead to slower convergence of training, but
   might lead to better generalization outside of the training set distribution.
 - ``loss``: This section describes the loss function to be used. See the
-  :doc:`dedicated documentation page <../advanced-concepts/loss-functions>` for more
-  details.
+  :ref:`loss-functions` for more details.
 
 In addition to these model-specific hypers, we re-highlight that the following additive
 models (``zbl`` and ``long_range``) may be needed to achieve better description at the
diff --git a/docs/src/advanced-concepts/auxiliary-outputs.rst b/docs/src/concepts/auxiliary-outputs.rst
similarity index 100%
rename from docs/src/advanced-concepts/auxiliary-outputs.rst
rename to docs/src/concepts/auxiliary-outputs.rst
diff --git a/docs/src/advanced-concepts/index.rst b/docs/src/concepts/index.rst
similarity index 100%
rename from docs/src/advanced-concepts/index.rst
rename to docs/src/concepts/index.rst
diff --git a/docs/src/advanced-concepts/loss-functions.rst b/docs/src/concepts/loss-functions.rst
similarity index 100%
rename from docs/src/advanced-concepts/loss-functions.rst
rename to docs/src/concepts/loss-functions.rst
diff --git a/docs/src/advanced-concepts/output-naming.rst b/docs/src/concepts/output-naming.rst
similarity index 100%
rename from docs/src/advanced-concepts/output-naming.rst
rename to docs/src/concepts/output-naming.rst
diff --git a/docs/src/faq.rst b/docs/src/faq.rst
index ad6a5170b5..ff52d0cd6f 100644
--- a/docs/src/faq.rst
+++ b/docs/src/faq.rst
@@ -39,7 +39,7 @@ If available you can also try to run on a GPU, which significantly increases per
 Looking at a distribution of your energies per atom can help. Furthermore, outliers,
 such as large forces complicate training, so looking at the distribution of the forces
 and removing structures with large forces (e.g. all structures with forces with an
 absolute force > 20 eV/Å) from the dataset can help to stabilize training. For these
 tasks parity plots can be useful to find outliers.
-See our :ref:`sphx_glr_examples_validation_parity_plot.py` for how to create them.
+See our :ref:`sphx_glr_generated_examples_0-beginner_04-parity_plot.py` for how to create them.
 
 General training concepts
 -------------------------
@@ -63,7 +63,7 @@ correlations up to roughly 10 Å.
 **Q: In what format should I provide my data?**
 
 **A:** You can find everything on how to prepare your data in
-:ref:`sphx_glr_examples_programmatic_data_preparation_data_preparation.py`.
+:ref:`sphx_glr_generated_examples_0-beginner_01-data_preparation.py`.
 
 **Q: How small should my errors be before I can use my model to run Molecular Dynamics
 simulations?**
diff --git a/docs/src/getting-started/index.rst b/docs/src/getting-started/index.rst
index fc111aaf31..a64c3e3f1d 100644
--- a/docs/src/getting-started/index.rst
+++ b/docs/src/getting-started/index.rst
@@ -7,7 +7,6 @@ This sections describes how to install the package, and its most basic
 commands.
 
    :maxdepth: 1
 
    quickstart
-   ../examples/basic_usage/usage
    custom_dataset_conf
    advanced_base_config
    override
diff --git a/docs/src/getting-started/override.rst b/docs/src/getting-started/override.rst
index 5296db7972..eb6725361c 100644
--- a/docs/src/getting-started/override.rst
+++ b/docs/src/getting-started/override.rst
@@ -4,7 +4,7 @@ Override Architecture's Default Parameters
 In our initial tutorial, we used default parameters to train a model employing the
 SOAP-BPNN architecture, as shown in the following config:
 
-.. literalinclude:: ../../../examples/basic_usage/options.yaml
+.. literalinclude:: ../../../examples/0-beginner/options-basic.yaml
    :language: yaml
 
 While default parameters often serve as a good starting point, depending on your
diff --git a/docs/src/index.rst b/docs/src/index.rst
index c1379fdacb..411d386689 100644
--- a/docs/src/index.rst
+++ b/docs/src/index.rst
@@ -19,8 +19,8 @@
    getting-started/index
    configuration/index
    architectures/index
-   tutorials/index
-   advanced-concepts/index
+   generated_examples/index
+   concepts/index
    faq
    cite
    dev-docs/index
diff --git a/docs/src/tutorials/advanced_tutorials/index.rst b/docs/src/tutorials/advanced_tutorials/index.rst
deleted file mode 100644
index 075a804027..0000000000
--- a/docs/src/tutorials/advanced_tutorials/index.rst
+++ /dev/null
@@ -1,17 +0,0 @@
-.. _advanced_tutorials:
-
-Advanced Tutorials
-==================
-
-This sections includes the advanced tutorials on the usage of the
-``metatrain`` package.
-
-.. toctree::
-   :maxdepth: 1
-
-   ../../advanced-concepts/transfer-learning
-   ../../examples/llpr/llpr
-   ../../examples/zbl/dimers
-   ../../advanced-concepts/fitting-generic-targets
-   ../../examples/programmatic/flashmd/flashmd
-   ../../examples/multi-gpu/multi-gpu
diff --git a/docs/src/tutorials/beginner_tutorials/index.rst b/docs/src/tutorials/beginner_tutorials/index.rst
deleted file mode 100644
index 5cec504471..0000000000
--- a/docs/src/tutorials/beginner_tutorials/index.rst
+++ /dev/null
@@ -1,16 +0,0 @@
-.. _beginner_tutorials:
-
-Beginner Tutorials
-==================
-
-This sections includes the beginner tutorials on the usage of the
-``metatrain`` package.
-
-.. toctree::
-   :maxdepth: 1
-
-   ../../examples/programmatic/data_preparation/data_preparation
-   ../../advanced-concepts/fine-tuning
-   ../../examples/train_from_scratch/train_from_scratch
-   ../../examples/ase/run_ase
-   ../../examples/validation/parity_plot
diff --git a/docs/src/tutorials/index.rst b/docs/src/tutorials/index.rst
deleted file mode 100644
index 59abcd6d3b..0000000000
--- a/docs/src/tutorials/index.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-.. _tutorials:
-
-Tutorials
-=========
-
-This sections includes all tutorials on the usage of the
-``metatrain`` package. There are beginner tutorials to help you get started
-and advanced tutorials for more complex applications and workflows.
-
-.. toctree::
-   :maxdepth: 2
-
-   beginner_tutorials/index
-   advanced_tutorials/index
diff --git a/examples/programmatic/data_preparation/.gitignore b/examples/0-beginner/.gitignore
similarity index 61%
rename from examples/programmatic/data_preparation/.gitignore
rename to examples/0-beginner/.gitignore
index afae9d0eaf..6153de5dc0 100644
--- a/examples/programmatic/data_preparation/.gitignore
+++ b/examples/0-beginner/.gitignore
@@ -1,3 +1,5 @@
 data.xyz
+output.xyz
 qm9_reduced_100.zip
 qm9_reduced_100_all_at_once.zip
+carbon_reduced_100_memmap/*
diff --git a/examples/basic_usage/usage.sh b/examples/0-beginner/00-basic-usage.sh
old mode 100755
new mode 100644
similarity index 92%
rename from examples/basic_usage/usage.sh
rename to examples/0-beginner/00-basic-usage.sh
index c3ffc0acbf..6fd2a32e53
--- a/examples/basic_usage/usage.sh
+++ b/examples/0-beginner/00-basic-usage.sh
@@ -14,9 +14,8 @@ mtt --help
 #
 # We now demonstrate how to ``train`` and ``evaluate`` a model from the command line.
 # For this example we use the :ref:`architecture-soap-bpnn` architecture and a subset of
-# the `QM9 dataset `_. You can obtain the
-# dataset for this example here: :download:`qm9_reduced_100.xyz
-# <../../../../examples/basic_usage/qm9_reduced_100.xyz>`.
+# the `QM9 dataset `_. You can obtain the
+# dataset for this example here: :download:`qm9_reduced_100.xyz`.
 #
 #
 # Training
@@ -44,7 +43,7 @@
 # corresponding documentation page. We will use these minimal options to run an example
 # training using the default hyperparameters of an SOAP BPNN model
 #
-# .. literalinclude:: ../../../../examples/basic_usage/options.yaml
+# .. literalinclude:: options-basic.yaml
 #    :language: yaml
 #
 # For each training run a new output directory in the format
@@ -54,7 +53,7 @@
 # in the current directory and type
 
-mtt train options.yaml
+mtt train options-basic.yaml
 
 # %%
 #
@@ -85,7 +84,7 @@ mtt train --help
 # system and possible target values for evaluation. The system section of this
 # ``eval.yaml`` is exactly the same as for a dataset in the ``options.yaml`` file.
 #
-# .. literalinclude:: ../../../../examples/basic_usage/eval.yaml
+# .. literalinclude:: eval-basic.yaml
 #    :language: yaml
 #
 # Note that the ``targets`` section is optional. If the ``targets`` section is present,
@@ -97,7 +96,7 @@
 # trained model and the second an option file containing the path of the dataset for
 # evaulation. The extensions of the model, if any, can be specified via the ``-e`` flag.
 
-mtt eval model.pt eval.yaml -e extensions/
+mtt eval model.pt eval-basic.yaml -e extensions/
 
 # %%
 #
diff --git a/examples/programmatic/data_preparation/data_preparation.py b/examples/0-beginner/01-data_preparation.py
similarity index 98%
rename from examples/programmatic/data_preparation/data_preparation.py
rename to examples/0-beginner/01-data_preparation.py
index 52a049bcdf..a8dfa80f25 100644
--- a/examples/programmatic/data_preparation/data_preparation.py
+++ b/examples/0-beginner/01-data_preparation.py
@@ -189,7 +189,7 @@
 structures = ase.io.read("carbon_reduced_100.xyz", index=":")
 
 root = Path("carbon_reduced_100_memmap/")
-root.mkdir()
+root.mkdir(exist_ok=True)
 
 ns_path = root / "ns.npy"
 na_path = root / "na.npy"
@@ -236,7 +236,7 @@
 #
 # For example, you can use the following options file:
 #
-# .. literalinclude:: options.yaml
+# .. literalinclude:: options-memmap.yaml
 #    :language: yaml
 
-subprocess.run(["mtt", "train", "options.yaml"])
+subprocess.run(["mtt", "train", "options-memmap.yaml"])
diff --git a/docs/src/advanced-concepts/fine-tuning.rst b/examples/0-beginner/02-fine-tuning.py
similarity index 80%
rename from docs/src/advanced-concepts/fine-tuning.rst
rename to examples/0-beginner/02-fine-tuning.py
index 2d20f322e9..f31a484d24 100644
--- a/docs/src/advanced-concepts/fine-tuning.rst
+++ b/examples/0-beginner/02-fine-tuning.py
@@ -1,7 +1,8 @@
+r"""
 .. _fine-tuning:
 
-Fine-tuning
-===========
+Fine-tune a pre-trained model
+=============================
 
 .. warning::
 
@@ -15,7 +16,6 @@
 There is a complete example in :ref:`Fine-tune example `.
-
 .. note::
 
     Please note that the fine-tuning recommendations in this section are not universal
@@ -44,8 +44,8 @@
     method: "full" # This stands for the full fine-tuning
     read_from: path/to/checkpoint.ckpt
 
-We recommend to use a lower learning rate than the one used for the original training, as
-this will help stabilizing the training process. I.e. if the default learning rate is
+We recommend using a lower learning rate than the one used for the original training,
+as this will help stabilize the training process. E.g., if the default learning rate is
 ``1e-4``, you can set it to ``1e-5`` or even lower, using the following in the
 ``options.yaml`` file:
@@ -59,19 +59,19 @@
 will be taken from the checkpoint and not adapted to the new dataset.
 
 The basic fine-tuning strategy is a good choice in the case when the level of theory
-which is used for the original training is the same, or at least similar to the one used for
-the new dataset. However, since this is not always the case, we also provide more advanced
-fine-tuning strategies described below.
+which is used for the original training is the same, or at least similar to the one used
+for the new dataset. However, since this is not always the case, we also provide more
+advanced fine-tuning strategies described below.
 
 Fine-tuning model Heads
 -----------------------
 
-Adapting all the model weights to a new dataset is not always the best approach. If the new
-dataset consist of the same or similar data computed with a slightly different level of theory
-compared to the pre-trained models' dataset, you might want to keep the learned representations
-of the crystal structures and only adapt the readout layers (i.e. the model heads) to the new
-dataset.
+Adapting all the model weights to a new dataset is not always the best approach. If the
+new dataset consists of the same or similar data computed with a slightly different
+level of theory compared to the pre-trained model's dataset, you might want to keep the
+learned representations of the crystal structures and only adapt the readout layers
+(i.e. the model heads) to the new dataset.
 
 In this case, the ``mtt train`` command needs to be accompanied by the specific training
 options in the ``options.yaml`` file. The following options need to be set:
@@ -98,18 +98,19 @@
 edge heads and last layers to be fine-tuned.
 
 We recommend to first start the fine-tuning including all the modules listed above and
-experiment with their different combinations if needed. You might also consider using a lower
-learning rate, e.g. ``1e-5`` or even lower, to stabilize the training process.
+experiment with their different combinations if needed. You might also consider using a
+lower learning rate, e.g. ``1e-5`` or even lower, to stabilize the training process.
 
 LoRA Fine-tuning
 ----------------
 
-If the conceptually new type of structures is introduced in the new dataset, tuning only the
-model heads might not be sufficient. In this case, you might need to adapt the internal
-representations of the crystal structures. This can be done using the LoRA technique. However,
-in this case the model heads will be not adapted to the new dataset, so conceptually the
-level of theory should be consistent with the one used for the pre-trained model.
+If a conceptually new type of structures is introduced in the new dataset, tuning only
+the model heads might not be sufficient. In this case, you might need to adapt the
+internal representations of the crystal structures. This can be done using the LoRA
+technique. However, in this case the model heads will not be adapted to the new dataset,
+so conceptually the level of theory should be consistent with the one used for the
+pre-trained model.
 
 What is LoRA?
 ^^^^^^^^^^^^^
@@ -161,9 +162,9 @@
 ------------------------------------
 
 If the new dataset is computed with a totally different level of theory compared to the
-pre-trained model, which includes, for instance, the different composition energies,
-or you want to fine-tune the model on a completely new target, you might need to consider
-the transfer learning approach and introduce a new target in the
-``options.yaml`` file. More details about this approach can be found in the
-:ref:`Transfer Learning <transfer-learning>` section of the documentation.
-
+pre-trained model, which includes, for instance, the different composition energies, or
+you want to fine-tune the model on a completely new target, you might need to consider
+the transfer learning approach and introduce a new target in the ``options.yaml`` file.
+More details about this approach can be found in the :ref:`Transfer Learning
+<transfer-learning>` section of the documentation.
+"""
diff --git a/examples/train_from_scratch/train_from_scratch.py b/examples/0-beginner/03-train_from_scratch.py
similarity index 91%
rename from examples/train_from_scratch/train_from_scratch.py
rename to examples/0-beginner/03-train_from_scratch.py
index 3cf4c432c1..a639da7f55 100644
--- a/examples/train_from_scratch/train_from_scratch.py
+++ b/examples/0-beginner/03-train_from_scratch.py
@@ -1,11 +1,10 @@
 """
-.. _train-from-scratch:
-
 Training a model from scratch
-#############################
+=============================
+
 This tutorial explains how to train a model with ``metatrain`` from scratch and evaluate
-it. :download:`This dataset ` is used here as an example of
-the preferred dataset format. If you have your own dataset, you can simply replace the
+it. :download:`This dataset ` is used here as an example of the
+preferred dataset format. If you have your own dataset, you can simply replace the
 dataset file name with yours.
 
 Train the model
@@ -19,7 +18,7 @@
 .. _`Available Architectures`:
    https://metatensor.github.io/metatrain/latest/architectures/index.html
 
-.. literalinclude:: ./options.yaml
+.. literalinclude:: options-scratch.yaml
    :language: yaml
    :linenos:
@@ -27,7 +26,7 @@
 .. code-block:: bash
 
-    mtt train options.yaml
+    mtt train options-scratch.yaml
 
 It will start training. ``metatrain`` will automatically read the atomic forces from
 the training set, if they are stored in it and named as "forces". The model can also be
@@ -104,22 +103,23 @@
 .. code-block:: bash
 
-    mtt train options.yaml --restart model.ckpt
+    mtt train options-scratch.yaml --restart model.ckpt
 
 Evaluate the trained model
 --------------------------
+
 In order to evaluate the model on the test set, we can use the mtt eval sub-command.
-First, create the input file ``eval.yaml`` with the following options:
+First, create the input file ``eval-scratch.yaml`` with the following options:
 
-.. literalinclude:: ./eval.yaml
+.. literalinclude:: ./eval-scratch.yaml
    :language: yaml
    :linenos:
 
-and run
+and run (be sure to replace ``model.pt`` with the path to your model)
 
 .. code-block:: bash
 
-    mtt eval PATH_TO_YOUR_MODEL/model.pt eval.yaml # be sure to replace the path
+    mtt eval model.pt eval-scratch.yaml
 
 After this, a file named ``output.xyz`` will be created, with the atom positions and the
 predicted forces recorded in it. Also, you should see these statistical on your screen
@@ -139,9 +139,9 @@
 To run the script, download it from the repository, modify the paths as necessary
 (indicated with a #TODO), and run. This will generate a plot saved at parity_plot.png.
-
 Use the model
--------------------------
+-------------
+
 With the trained model, you can run molecular dynamics. Please refer to these two
 tutorials for `ASE`_ and `LAMMPS`_ to see how to do that.
diff --git a/examples/validation/parity_plot.py b/examples/0-beginner/04-parity_plot.py
similarity index 88%
rename from examples/validation/parity_plot.py
rename to examples/0-beginner/04-parity_plot.py
index e806c51c35..3375fa35e5 100644
--- a/examples/validation/parity_plot.py
+++ b/examples/0-beginner/04-parity_plot.py
@@ -1,11 +1,11 @@
 """
-Model validation with parity plots for energies and forces
-==========================================================
+Model validation with parity plots
+==================================
 
 This tutorial shows how to visualise your model output using parity plots. In the
-:ref:`train-from-scratch` we learned how to evaluate a trained model on a test set and
-save the results to an output file. Here we will show how to create parity plots from
-these results.
+:ref:`sphx_glr_generated_examples_0-beginner_03-train_from_scratch.py` we learned how to
+evaluate a trained model on a test set and save the results to an output file. Here we
+will show how to create parity plots from these results.
""" @@ -18,12 +18,12 @@ # %% -# Load the target and predicted data -targets = ase.io.read( - "../train_from_scratch/ethanol_reduced_100.xyz", ":" -) # reference data (ground truth) -predictions = ase.io.read("output.xyz", ":") # predicted data from the model +# Load the reference data (ground truth) +targets = ase.io.read("ethanol_reduced_100.xyz", ":") +# %% +# predicted data from a model +predictions = ase.io.read("ethanol_reduced_100_predicted.xyz", ":") # %% # Extract the energies from the loaded frames @@ -69,23 +69,28 @@ axs[1].set_ylim([min_f, max_f]) axs[1].set_title("Force Parity Plot") -plt.tight_layout() +fig.tight_layout() plt.show() +# %% +# We can also compute and print RMSE for energies and forces + print( "RMSE energy (per atom):", np.sqrt(np.mean((e_targets - e_predictions) ** 2)), "kcal", ) print("RMSE forces:", np.sqrt(np.mean((f_targets - f_predictions) ** 2)), "kcal/Å ") + # %% +# # The results are a bit poor here because the model was not trained well enough and # was created only for demonstration purposes. In the case of a well-trained model, the # points should be closer to the diagonal line. - -# %% +# # Check outliers with ``Chemiscope`` # ---------------------------------- +# # With the approach above, you can inspect the whole dataset, but it might be difficult # to identify outliers. `Chemiscope ` is a # visualisation tool, allowing you to explore the dataset interactively. The following @@ -94,13 +99,15 @@ for frame in targets + predictions: frame.arrays["forces"] = frame.get_forces() + # a workaround, because the chemiscope interface for getting forces is broken with ASE # 3.23 # %% # Plot the energy parity plot with Chemiscope. This can be rendered as a widget in a # Jupyter notebook. -cs = chemiscope.show( + +chemiscope.show( targets, # reading structures from the dataset properties={ "Target energy": {"values": e_targets, "target": "structure", "units": "kcal"}, @@ -129,7 +136,6 @@ }, ), ) -cs # %% # You can check the structures by clicking the red dots on the parity plot, or diff --git a/examples/ase/run_ase.py b/examples/0-beginner/05-run_ase.py similarity index 91% rename from examples/ase/run_ase.py rename to examples/0-beginner/05-run_ase.py index 06e9952fd7..9310d95b25 100644 --- a/examples/ase/run_ase.py +++ b/examples/0-beginner/05-run_ase.py @@ -12,22 +12,17 @@ The model was trained using the following training options. -.. literalinclude:: options.yaml +.. literalinclude:: options-ase.yaml :language: yaml -You can train the same model yourself with +We first train the model same model but and before import the necessary libraries and +run the training process and the integration of ASE. -.. literalinclude:: train.sh - :language: bash - -A detailed step-by-step introduction on how to train a model is provided in -the :ref:`label_basic_usage` tutorial. """ - # %% # -# First, we start by importing the necessary libraries, including the integration of ASE -# calculators for metatensor atomistic models. + +import subprocess import ase.md import ase.md.velocitydistribution @@ -41,6 +36,14 @@ # %% # + +subprocess.run(["mtt", "train", "options-ase.yaml", "--output", "model-md.pt"]) + +# %% +# +# A detailed step-by-step introduction on how to train a model is provided in the +# :ref:`label_basic_usage` tutorial. +# # Setting up the simulation # ------------------------- # @@ -74,7 +77,7 @@ # We now register our exported model as the energy calculator to obtain energies and # forces. 
-atoms.calc = MetatomicCalculator("model.pt", extensions_directory="extensions/")
+atoms.calc = MetatomicCalculator("model-md.pt", extensions_directory="extensions/")
 
 # %%
 #
diff --git a/examples/0-beginner/GALLERY_HEADER.rst b/examples/0-beginner/GALLERY_HEADER.rst
new file mode 100644
index 0000000000..6bc08086ab
--- /dev/null
+++ b/examples/0-beginner/GALLERY_HEADER.rst
@@ -0,0 +1,7 @@
+.. _beginner_tutorials:
+
+Beginner Tutorials
+------------------
+
+This section includes the beginner tutorials on the usage of the ``metatrain`` package.
+
diff --git a/examples/0-beginner/carbon_reduced_100.xyz b/examples/0-beginner/carbon_reduced_100.xyz
new file mode 120000
index 0000000000..5f4bb4cb8a
--- /dev/null
+++ b/examples/0-beginner/carbon_reduced_100.xyz
@@ -0,0 +1 @@
+../../tests/resources/carbon_reduced_100.xyz
\ No newline at end of file
diff --git a/examples/ase/ethanol_reduced_100.xyz b/examples/0-beginner/ethanol_reduced_100.xyz
similarity index 100%
rename from examples/ase/ethanol_reduced_100.xyz
rename to examples/0-beginner/ethanol_reduced_100.xyz
diff --git a/examples/validation/output.xyz b/examples/0-beginner/ethanol_reduced_100_predicted.xyz
similarity index 100%
rename from examples/validation/output.xyz
rename to examples/0-beginner/ethanol_reduced_100_predicted.xyz
diff --git a/examples/basic_usage/eval.yaml b/examples/0-beginner/eval-basic.yaml
similarity index 100%
rename from examples/basic_usage/eval.yaml
rename to examples/0-beginner/eval-basic.yaml
diff --git a/examples/train_from_scratch/eval.yaml b/examples/0-beginner/eval-scratch.yaml
similarity index 100%
rename from examples/train_from_scratch/eval.yaml
rename to examples/0-beginner/eval-scratch.yaml
diff --git a/examples/ase/options.yaml b/examples/0-beginner/options-ase.yaml
similarity index 100%
rename from examples/ase/options.yaml
rename to examples/0-beginner/options-ase.yaml
diff --git a/examples/basic_usage/options.yaml b/examples/0-beginner/options-basic.yaml
similarity index 100%
rename from examples/basic_usage/options.yaml
rename to examples/0-beginner/options-basic.yaml
diff --git a/examples/programmatic/data_preparation/options.yaml b/examples/0-beginner/options-memmap.yaml
similarity index 100%
rename from examples/programmatic/data_preparation/options.yaml
rename to examples/0-beginner/options-memmap.yaml
diff --git a/examples/train_from_scratch/options.yaml b/examples/0-beginner/options-scratch.yaml
similarity index 100%
rename from examples/train_from_scratch/options.yaml
rename to examples/0-beginner/options-scratch.yaml
diff --git a/examples/basic_usage/qm9_reduced_100.xyz b/examples/0-beginner/qm9_reduced_100.xyz
similarity index 100%
rename from examples/basic_usage/qm9_reduced_100.xyz
rename to examples/0-beginner/qm9_reduced_100.xyz
diff --git a/examples/0-beginner/qm9_reduced_100.zip b/examples/0-beginner/qm9_reduced_100.zip
new file mode 100644
index 0000000000..26896a180b
Binary files /dev/null and b/examples/0-beginner/qm9_reduced_100.zip differ
diff --git a/examples/0-beginner/qm9_reduced_100_all_at_once.zip b/examples/0-beginner/qm9_reduced_100_all_at_once.zip
new file mode 100644
index 0000000000..8778c2f672
Binary files /dev/null and b/examples/0-beginner/qm9_reduced_100_all_at_once.zip differ
diff --git a/examples/1-advanced/.gitignore b/examples/1-advanced/.gitignore
new file mode 100644
index 0000000000..d0920d4284
--- /dev/null
+++ b/examples/1-advanced/.gitignore
@@ -0,0 +1 @@
+flashmd.xyz
diff --git a/docs/src/advanced-concepts/transfer-learning.rst b/examples/1-advanced/00-transfer-learning.py
similarity index 73%
rename from docs/src/advanced-concepts/transfer-learning.rst
rename to examples/1-advanced/00-transfer-learning.py
index eec6ea2cbd..c7b2b059c5 100644
--- a/docs/src/advanced-concepts/transfer-learning.rst
+++ b/examples/1-advanced/00-transfer-learning.py
@@ -1,3 +1,4 @@
+"""
 .. _transfer-learning:
 
 Transfer Learning (experimental)
 ================================
@@ -32,12 +33,13 @@
 --------------------------------
 
 Training on a new level of theory is a common use case for transfer learning. It
-requires using a pre-trained model checkpoint with the ``mtt train`` command and setting the
-new targets corresponding to the new level of theory in the ``options.yaml`` file. Let's
-assume that the training is done on the dataset computed with the hybrid DFT functional
-(e.g. PBE0) stored in the ``new_train_dataset.xyz`` file, where the corresponsing
-energies and forces are written in the ``energy`` and ``forces`` key of the ``info`` dictionary
-of the ``ase.Atoms`` object. Then, the ``options.yaml`` file should look like this:
+requires using a pre-trained model checkpoint with the ``mtt train`` command and setting
+the new targets corresponding to the new level of theory in the ``options.yaml`` file.
+Let's assume that the training is done on the dataset computed with the hybrid DFT
+functional (e.g. PBE0) stored in the ``new_train_dataset.xyz`` file, where the
+corresponding energies and forces are written in the ``energy`` and ``forces`` keys of
+the ``info`` dictionary of the ``ase.Atoms`` object. Then, the ``options.yaml`` file
+should look like this:
 
 .. code-block:: yaml
 
@@ -71,11 +73,9 @@
 Fitting to a new set of properties
 ----------------------------------
 
-Training on a new set of properties is another common use case for
-transfer learning. It can be done in a similar way as training on a new
-level of theory. The only difference is that the new targets need to be
-properly set in the ``options.yaml`` file. More information about fitting the
-generic targets can be found in the :ref:`Fitting generic targets <fitting-generic-targets>`
-section of the documentation.
-
-
+Training on a new set of properties is another common use case for transfer learning. It
+can be done in a similar way as training on a new level of theory. The only difference
+is that the new targets need to be properly set in the ``options.yaml`` file. More
+information about fitting the generic targets can be found in the :ref:`Fitting generic
+targets <fitting-generic-targets>` section of the documentation.
+"""
diff --git a/examples/llpr/llpr.py b/examples/1-advanced/01-llpr.py
similarity index 86%
rename from examples/llpr/llpr.py
rename to examples/1-advanced/01-llpr.py
index b40c8e68cd..8979340c17 100644
--- a/examples/llpr/llpr.py
+++ b/examples/1-advanced/01-llpr.py
@@ -14,7 +14,7 @@
 The baseline model was trained using the following training options, where the
 training set consists of 100 structures from the QM9 dataset.
 
-.. literalinclude:: options.yaml
+.. literalinclude:: options-no-llpr.yaml
    :language: yaml
 
 Once a model is trained, you can add LLPR uncertainties to it by launching a training
@@ -29,17 +29,11 @@
 You can repeat the same training yourself with
 
-.. literalinclude:: train.sh
-   :language: bash
-
-A detailed step-by-step introduction on how to train a model is provided in
-the :ref:`label_basic_usage` tutorial.
 """
-
 # %%
 #
-# As an example, we will compute the energies and uncertainties of the LLPR model on a
-# few ethanol structures.
+
+import subprocess
 
 import ase.io
 import matplotlib.pyplot as plt
@@ -49,10 +43,28 @@
 from metatomic.torch.ase_calculator import MetatomicCalculator
 
-# load 5 ethanol structures
+# %%
+#
+# We first train the baseline model without uncertainties and then the LLPR model.
+
+
+subprocess.run(["mtt", "train", "options-no-llpr.yaml", "-o", "model.pt"])
+subprocess.run(["mtt", "train", "options-llpr.yaml", "-o", "model-llpr.pt"])
+
+# %%
+#
+# A detailed step-by-step introduction on how to train a model is provided in
+# the :ref:`label_basic_usage` tutorial.
+#
+# As an example, we will compute the energies and uncertainties of the LLPR model on a
+# few ethanol structures.
+#
+# Next we load 5 ethanol structures
 structures = ase.io.read("ethanol_reduced_100.xyz", ":5")
 
-# load the model as an ASE calculator
+# %%
+#
+# and load the model as an ASE calculator
 calc = MetatomicCalculator(
     "model-llpr.pt", extensions_directory="extensions/", device="cpu"
 )
diff --git a/examples/zbl/dimers.py b/examples/1-advanced/02-zbl.py
similarity index 93%
rename from examples/zbl/dimers.py
rename to examples/1-advanced/02-zbl.py
index 59c11a88bc..3fa6b5acdc 100644
--- a/examples/zbl/dimers.py
+++ b/examples/1-advanced/02-zbl.py
@@ -10,21 +10,15 @@
 The models are trained using the following training options, respectively:
 
-.. literalinclude:: options_no_zbl.yaml
+.. literalinclude:: options-no-zbl.yaml
    :language: yaml
 
 .. literalinclude:: options_zbl.yaml
    :language: yaml
 
-As you can see, they are identical, except for the ``zbl`` key in the
-``model`` section.
-You can train the same models yourself with
-
-.. literalinclude:: train.sh
-   :language: bash
-
-A detailed step-by-step introduction on how to train a model is provided in
-the :ref:`label_basic_usage` tutorial.
+As you can see, they are identical, except for the ``zbl`` key in the ``model`` section.
+A detailed step-by-step introduction on how to train a model is provided in the
+:ref:`label_basic_usage` tutorial.
 """
 
 # %%
@@ -32,6 +26,8 @@
 # First, we start by importing the necessary libraries, including the integration of ASE
 # calculators for metatensor atomistic models.
 
+import subprocess
+
 import ase
 import matplotlib.pyplot as plt
 import numpy as np
 from metatomic.torch.ase_calculator import MetatomicCalculator
 
+# %%
+
+subprocess.run(["mtt", "train", "options-no-zbl.yaml", "-o", "model_no_zbl.pt"])
+subprocess.run(["mtt", "train", "options_zbl.yaml", "-o", "model_zbl.pt"])
+
 # %%
 #
 # Setting up the dimers
diff --git a/docs/src/advanced-concepts/fitting-generic-targets.rst b/examples/1-advanced/03-fitting-generic-targets.py
similarity index 99%
rename from docs/src/advanced-concepts/fitting-generic-targets.rst
rename to examples/1-advanced/03-fitting-generic-targets.py
index 595a6b991c..ab84e6d0d1 100644
--- a/docs/src/advanced-concepts/fitting-generic-targets.rst
+++ b/examples/1-advanced/03-fitting-generic-targets.py
@@ -1,3 +1,4 @@
+"""
 .. _fitting-generic-targets:
 
 Fitting generic targets
@@ -155,3 +156,4 @@
 the ``TensorMap`` should have the ``o3_lambda`` and ``o3_sigma`` names, corresponding to
 the values provided in the input file, and each ``TensorBlock`` should be one component
 label, with name ``o3_mu`` and values going from -L to L.
+""" diff --git a/examples/programmatic/flashmd/flashmd.py b/examples/1-advanced/04-flashmd.py similarity index 97% rename from examples/programmatic/flashmd/flashmd.py rename to examples/1-advanced/04-flashmd.py index 701716923b..970b523723 100644 --- a/examples/programmatic/flashmd/flashmd.py +++ b/examples/1-advanced/04-flashmd.py @@ -114,7 +114,7 @@ def get_structure_for_dataset(frame_now, frame_ahead): # # For example, you can use the following options file: # -# .. literalinclude:: options.yaml +# .. literalinclude:: options-flashmd.yaml # :language: yaml -subprocess.run(["mtt", "train", "options.yaml"]) +subprocess.run(["mtt", "train", "options-flashmd.yaml"]) diff --git a/examples/multi-gpu/multi-gpu.py b/examples/1-advanced/05-multi-gpu.py similarity index 72% rename from examples/multi-gpu/multi-gpu.py rename to examples/1-advanced/05-multi-gpu.py index 93031b659a..64864938e8 100644 --- a/examples/multi-gpu/multi-gpu.py +++ b/examples/1-advanced/05-multi-gpu.py @@ -1,5 +1,4 @@ """ -================== Multi-GPU training ================== @@ -13,18 +12,31 @@ The different gradients obtained on each device are then summed. This approach allows the user to reduce the time it takes to train models. -To know if the model supports multi-GPU training, please check `Available Architectures -<../../docs/src/architectures/index.rst>`_ and see if the default hyperparameters have -the ``distributed`` option. +To know if the model supports multi-GPU training, please check +:ref:`available-architectures` and see if the default hyperparameters have the +``distributed`` option. Input file ---------- + To do this, you only need to switch on the ``distributed`` option in the ``.yaml`` file -for the training. Let's take `this tutorial -<../beginner_tutorials/train-from-scratch.rst>`_ as an example. Now, the -``options.yaml`` is +for the training. Let's take the +:ref:`sphx_glr_generated_examples_0-beginner_03-train_from_scratch.py` example and +adjust the ``options.yaml`` file. + +To know if the model supports multi-GPU training, please check +:ref:`available-architectures` and see if the default hyperparameters have the +``distributed`` option. -.. literalinclude:: ./options-distributed.yaml +Input file +---------- + +To do this, you only need to switch on the ``distributed`` option in the ``.yaml`` file +for the training. Let's take the +:ref:`sphx_glr_generated_examples_0-beginner_03-train_from_scratch.py` example and +adjust the ``options.yaml`` file. + +.. literalinclude:: options-distributed.yaml :language: yaml :linenos: @@ -35,8 +47,8 @@ configurations vary from clusters to clusters, so you have to modify it. Different scheduler will require similar options. ``metatrain`` will automatically use all the GPUs that you have asked for. You should make a single GPU visible for each process -(setting `--gpus-per-node` equal to the number of GPUs, or setting `--gpus-per-task=1`, -depending on your cluster configuration). +(setting ``--gpus-per-node`` equal to the number of GPUs, or setting +``--gpus-per-task=1``, depending on your cluster configuration). .. code-block:: bash @@ -84,7 +96,8 @@ Multi-GPU fine-tuning --------------------- + You can use multi-GPU for fine-tuning too, by writing ``distributed: True`` in the -``.yaml`` input. For information about fine-tuning, please refer to `this tutorial on -fine-tuning <../../getting-started/finetuning-example.rst>`_. +``.yaml`` input. 
+:ref:`sphx_glr_generated_examples_0-beginner_02-fine-tuning.py` example.
 """
diff --git a/examples/1-advanced/GALLERY_HEADER.rst b/examples/1-advanced/GALLERY_HEADER.rst
new file mode 100644
index 0000000000..564319f571
--- /dev/null
+++ b/examples/1-advanced/GALLERY_HEADER.rst
@@ -0,0 +1,8 @@
+.. _advanced_tutorials:
+
+Advanced Tutorials
+------------------
+
+This section includes the advanced tutorials on the usage of the ``metatrain`` package.
+If you are new to ``metatrain``, we recommend starting with the :ref:`label_quickstart`
+and the :ref:`beginner_tutorials`.
diff --git a/examples/llpr/ethanol_reduced_100.xyz b/examples/1-advanced/ethanol_reduced_100.xyz
similarity index 100%
rename from examples/llpr/ethanol_reduced_100.xyz
rename to examples/1-advanced/ethanol_reduced_100.xyz
diff --git a/examples/1-advanced/options-distributed-soap.yaml b/examples/1-advanced/options-distributed-soap.yaml
new file mode 120000
index 0000000000..9b9da26ac2
--- /dev/null
+++ b/examples/1-advanced/options-distributed-soap.yaml
@@ -0,0 +1 @@
+../../tests/distributed/options-distributed.yaml
\ No newline at end of file
diff --git a/examples/1-advanced/options-distributed.yaml b/examples/1-advanced/options-distributed.yaml
new file mode 120000
index 0000000000..9b9da26ac2
--- /dev/null
+++ b/examples/1-advanced/options-distributed.yaml
@@ -0,0 +1 @@
+../../tests/distributed/options-distributed.yaml
\ No newline at end of file
diff --git a/examples/programmatic/flashmd/options.yaml b/examples/1-advanced/options-flashmd.yaml
similarity index 100%
rename from examples/programmatic/flashmd/options.yaml
rename to examples/1-advanced/options-flashmd.yaml
diff --git a/examples/llpr/options-llpr.yaml b/examples/1-advanced/options-llpr.yaml
similarity index 100%
rename from examples/llpr/options-llpr.yaml
rename to examples/1-advanced/options-llpr.yaml
diff --git a/examples/llpr/options.yaml b/examples/1-advanced/options-no-llpr.yaml
similarity index 100%
rename from examples/llpr/options.yaml
rename to examples/1-advanced/options-no-llpr.yaml
diff --git a/examples/zbl/options_no_zbl.yaml b/examples/1-advanced/options-no-zbl.yaml
similarity index 100%
rename from examples/zbl/options_no_zbl.yaml
rename to examples/1-advanced/options-no-zbl.yaml
diff --git a/examples/zbl/options_zbl.yaml b/examples/1-advanced/options_zbl.yaml
similarity index 100%
rename from examples/zbl/options_zbl.yaml
rename to examples/1-advanced/options_zbl.yaml
diff --git a/examples/llpr/qm9_reduced_100.xyz b/examples/1-advanced/qm9_reduced_100.xyz
similarity index 100%
rename from examples/llpr/qm9_reduced_100.xyz
rename to examples/1-advanced/qm9_reduced_100.xyz
diff --git a/examples/GALLERY_HEADER.rst b/examples/GALLERY_HEADER.rst
new file mode 100644
index 0000000000..f5de0c2442
--- /dev/null
+++ b/examples/GALLERY_HEADER.rst
@@ -0,0 +1,6 @@
+Tutorials
+=========
+
+Here we present beginner and advanced examples of how to use ``metatrain`` for
+various applications. The beginner examples cover basic usage and data
+preparation, while the advanced examples demonstrate more complex scenarios.
diff --git a/examples/README.rst b/examples/README.rst
deleted file mode 100644
index e7c763f8e1..0000000000
--- a/examples/README.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-Metatrain Examples
-==================
-
-This folder consists of introductory and advanced examples.
diff --git a/examples/ase/README.rst b/examples/ase/README.rst
deleted file mode 100644
index 50f7bdb413..0000000000
--- a/examples/ase/README.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-Running molecular dynamics with ASE
-===================================
diff --git a/examples/ase/train.sh b/examples/ase/train.sh
deleted file mode 100755
index f73b048e61..0000000000
--- a/examples/ase/train.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/bash
-
-mtt train options.yaml
diff --git a/examples/basic_usage/README.rst b/examples/basic_usage/README.rst
deleted file mode 100644
index a93f89a85c..0000000000
--- a/examples/basic_usage/README.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-Basic usage of the metatrain CLI
-================================
diff --git a/examples/beginner/README.rst b/examples/beginner/README.rst
deleted file mode 100644
index 6193a314f0..0000000000
--- a/examples/beginner/README.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-Beginner tutorials
-==================
diff --git a/examples/llpr/README.rst b/examples/llpr/README.rst
deleted file mode 100644
index f623186ad5..0000000000
--- a/examples/llpr/README.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-Computing LLPR uncertainties
-============================
diff --git a/examples/llpr/train.sh b/examples/llpr/train.sh
deleted file mode 100644
index f0bb577cc7..0000000000
--- a/examples/llpr/train.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash
-
-mtt train options.yaml -o model.pt
-mtt train options-llpr.yaml -o model-llpr.pt
diff --git a/examples/multi-gpu/README.rst b/examples/multi-gpu/README.rst
deleted file mode 100644
index 099549c1e0..0000000000
--- a/examples/multi-gpu/README.rst
+++ /dev/null
@@ -1,3 +0,0 @@
-==================
-Multi-GPU training
-==================
diff --git a/examples/multi-gpu/ethanol_reduced_100.xyz b/examples/multi-gpu/ethanol_reduced_100.xyz
deleted file mode 120000
index f01afa4c67..0000000000
--- a/examples/multi-gpu/ethanol_reduced_100.xyz
+++ /dev/null
@@ -1 +0,0 @@
-../ase/ethanol_reduced_100.xyz
\ No newline at end of file
diff --git a/examples/multi-gpu/options-distributed.yaml b/examples/multi-gpu/options-distributed.yaml
deleted file mode 100644
index 6e9c43ac33..0000000000
--- a/examples/multi-gpu/options-distributed.yaml
+++ /dev/null
@@ -1,22 +0,0 @@
-architecture:
-  name: pet
-  model:
-    cutoff: 4.5
-  training:
-    distributed: true # switch on distributed training mode
-    num_epochs: 100 # increased to 100 to compare with non-distributed
-    batch_size: 10
-    log_interval: 1
-    checkpoint_interval: 10
-
-training_set:
-  systems:
-    read_from: ethanol_reduced_100.xyz
-    length_unit: Angstrom
-  targets:
-    energy:
-      key: energy
-      unit: eV
-
-test_set: 0.1
-validation_set: 0.1
diff --git a/examples/multi-gpu/soap-bpnn/options-distributed.yaml b/examples/multi-gpu/soap-bpnn/options-distributed.yaml
deleted file mode 120000
index d317b95012..0000000000
--- a/examples/multi-gpu/soap-bpnn/options-distributed.yaml
+++ /dev/null
@@ -1 +0,0 @@
-../../../tests/distributed/options-distributed.yaml
\ No newline at end of file
diff --git a/examples/multi-gpu/soap-bpnn/submit-distributed.sh b/examples/multi-gpu/soap-bpnn/submit-distributed.sh
deleted file mode 120000
index 5b31a6b9bc..0000000000
--- a/examples/multi-gpu/soap-bpnn/submit-distributed.sh
+++ /dev/null
@@ -1 +0,0 @@
-../../../tests/distributed/submit-distributed.sh
\ No newline at end of file
diff --git a/examples/programmatic/data_preparation/README.rst b/examples/programmatic/data_preparation/README.rst
deleted file mode 100644
index eccaed31e6..0000000000
--- a/examples/programmatic/data_preparation/README.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-How to prepare data for training
-================================
diff --git a/examples/programmatic/data_preparation/carbon_reduced_100.xyz b/examples/programmatic/data_preparation/carbon_reduced_100.xyz
deleted file mode 120000
index a702ba7615..0000000000
--- a/examples/programmatic/data_preparation/carbon_reduced_100.xyz
+++ /dev/null
@@ -1 +0,0 @@
-../../../tests/resources/carbon_reduced_100.xyz
\ No newline at end of file
diff --git a/examples/programmatic/data_preparation/qm9_reduced_100.xyz b/examples/programmatic/data_preparation/qm9_reduced_100.xyz
deleted file mode 120000
index a98d028721..0000000000
--- a/examples/programmatic/data_preparation/qm9_reduced_100.xyz
+++ /dev/null
@@ -1 +0,0 @@
-../../../tests/resources/qm9_reduced_100.xyz
\ No newline at end of file
diff --git a/examples/programmatic/flashmd/README.rst b/examples/programmatic/flashmd/README.rst
deleted file mode 100644
index 870044d0b5..0000000000
--- a/examples/programmatic/flashmd/README.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-Training a FlashMD model
-========================
diff --git a/examples/programmatic/use_architectures_outside/README.rst b/examples/programmatic/use_architectures_outside/README.rst
deleted file mode 100644
index b3c4c2c56d..0000000000
--- a/examples/programmatic/use_architectures_outside/README.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-Using metatrain architectures outside of metatrain
-==================================================
diff --git a/examples/programmatic/use_architectures_outside/qm9_reduced_100.xyz b/examples/programmatic/use_architectures_outside/qm9_reduced_100.xyz
deleted file mode 120000
index a98d028721..0000000000
--- a/examples/programmatic/use_architectures_outside/qm9_reduced_100.xyz
+++ /dev/null
@@ -1 +0,0 @@
-../../../tests/resources/qm9_reduced_100.xyz
\ No newline at end of file
diff --git a/examples/programmatic/use_architectures_outside/use_outside.py b/examples/programmatic/use_architectures_outside/use_outside.py
deleted file mode 100644
index 63ac1eb38e..0000000000
--- a/examples/programmatic/use_architectures_outside/use_outside.py
+++ /dev/null
@@ -1,90 +0,0 @@
-"""
-Using metatrain architectures outside of metatrain
-==================================================
-
-This tutorial demonstrates how to use one of metatrain's implemented architectures
-outside of metatrain. This will be done by taking internal representations of a
-NanoPET model (as an example) and using them inside a user-defined torch ``Module``.
-
-Only architectures which can output internal representations ("features" output) can
-be used in this way.
-"""
-
-# %%
-#
-
-import torch
-from metatomic.torch import ModelOutput
-
-from metatrain.pet import PET
-from metatrain.utils.architectures import get_default_hypers
-from metatrain.utils.data import DatasetInfo, read_systems
-from metatrain.utils.neighbor_lists import (
-    get_requested_neighbor_lists,
-    get_system_with_neighbor_lists,
-)
-
-
-# %%
-#
-# Read some sample systems. Metatrain always reads systems in float64, while torch
-# uses float32 by default. We will convert the systems to float32.
-
-systems = read_systems("qm9_reduced_100.xyz")
-systems = [s.to(torch.float32) for s in systems]
-
-
-# %%
-#
-# Define the custom model using the PET architecture as a building block.
-# The dummy architecture here adds a linear layer and a tanh activation function
-# on top of the PET model.
-
-
-class PETWithTanh(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.pet = PET(
-            get_default_hypers("pet")["model"],
-            DatasetInfo(
-                length_unit="angstrom",
-                atomic_types=[1, 6, 7, 8, 9],
-                targets={},
-            ),
-        )
-        self.linear = torch.nn.Linear(384, 1)
-        self.tanh = torch.nn.Tanh()
-
-    def forward(self, systems):
-        model_outputs = self.pet(
-            systems,
-            {"features": ModelOutput()},
-            # ModelOutput(per_atom=True) would give per-atom features
-        )
-        features = model_outputs["features"].block().values
-        return self.tanh(self.linear(features))
-
-
-# %%
-#
-# Now we can train the custom model. Here is one training step executed with
-# some random targets.
-my_targets = torch.randn(100, 1)
-
-# instantiate the model
-model = PETWithTanh()
-
-# all metatrain models require neighbor lists to be present in the input systems
-systems = [
-    get_system_with_neighbor_lists(sys, get_requested_neighbor_lists(model))
-    for sys in systems
-]
-
-# define an optimizer
-optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
-
-# this is one training step
-predictions = model(systems)
-loss = torch.nn.functional.mse_loss(predictions, my_targets)
-loss.backward()
-optimizer.step()
diff --git a/examples/train_from_scratch/README.rst b/examples/train_from_scratch/README.rst
deleted file mode 100644
index ac545fe554..0000000000
--- a/examples/train_from_scratch/README.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-Training a model from scratch
-=============================
diff --git a/examples/train_from_scratch/ethanol_reduced_100.xyz b/examples/train_from_scratch/ethanol_reduced_100.xyz
deleted file mode 120000
index f01afa4c67..0000000000
--- a/examples/train_from_scratch/ethanol_reduced_100.xyz
+++ /dev/null
@@ -1 +0,0 @@
-../ase/ethanol_reduced_100.xyz
\ No newline at end of file
diff --git a/examples/validation/README.rst b/examples/validation/README.rst
deleted file mode 100644
index 4340241618..0000000000
--- a/examples/validation/README.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-Creating parity plots for validation
-====================================
diff --git a/examples/zbl/README.rst b/examples/zbl/README.rst
deleted file mode 100644
index 50f7bdb413..0000000000
--- a/examples/zbl/README.rst
+++ /dev/null
@@ -1,2 +0,0 @@
-Running molecular dynamics with ASE
-===================================
diff --git a/examples/zbl/ethanol_reduced_100.xyz b/examples/zbl/ethanol_reduced_100.xyz
deleted file mode 120000
index acaad11d93..0000000000
--- a/examples/zbl/ethanol_reduced_100.xyz
+++ /dev/null
@@ -1 +0,0 @@
-../../tests/resources/ethanol_reduced_100.xyz
\ No newline at end of file
diff --git a/examples/zbl/train.sh b/examples/zbl/train.sh
deleted file mode 100755
index 03b6baab24..0000000000
--- a/examples/zbl/train.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash
-
-mtt train options_no_zbl.yaml -o model_no_zbl.pt
-mtt train options_zbl.yaml -o model_zbl.pt
diff --git a/src/metatrain/gap/tests/ethanol_reduced_100.xyz b/src/metatrain/gap/tests/ethanol_reduced_100.xyz
deleted file mode 120000
index 8084906dc2..0000000000
--- a/src/metatrain/gap/tests/ethanol_reduced_100.xyz
+++ /dev/null
@@ -1 +0,0 @@
-../../../../examples/ase/ethanol_reduced_100.xyz
\ No newline at end of file
diff --git a/tox.ini b/tox.ini
index c915475d08..a5395a98c4 100644
--- a/tox.ini
+++ b/tox.ini
@@ -223,12 +223,11 @@ extras =
     # these architectures are used in the documentation
     flashmd
     soap-bpnn
 commands =
-    {env:ENV_INSTALLER:python -m pip} install --no-build-isolation sphericart torch-spex>=0.1,<0.2 wigners
-    # Run example and usage scripts.
-    bash -c "set -e && cd {toxinidir}/examples/basic_usage && bash usage.sh"
-    bash -c "set -e && cd {toxinidir}/examples/ase && bash train.sh"
-    bash -c "set -e && cd {toxinidir}/examples/llpr && bash train.sh"
-    bash -c "set -e && cd {toxinidir}/examples/zbl && bash train.sh"
+    uv pip install --no-build-isolation sphericart torch-spex>=0.1,<0.2 wigners
+
+    # Run all .sh files in the example folder (including subfolders)
+    bash -c "set -e && for f in $(find examples -name '*.sh'); do cd $(dirname $f); bash $(basename $f); cd -; done"
+
     sphinx-build \
         {posargs:-E} \
         --builder html \