
Commit 7a28716

Committed: Update [ghstack-poisoned]
2 parents (ecc2eb3 + 4a7cb21), commit 7a28716

File tree: 26 files changed, +51 -116 lines changed

.github/workflows/docs.yml

Lines changed: 2 additions & 2 deletions

@@ -26,7 +26,7 @@ jobs:
   build-docs:
     strategy:
       matrix:
-        python_version: [ "3.9" ]
+        python_version: [ "3.12" ]
         cuda_arch_version: [ "12.8" ]
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     with:
@@ -60,7 +60,7 @@ jobs:
       bash ./miniconda.sh -b -f -p "${conda_dir}"
       eval "$(${conda_dir}/bin/conda shell.bash hook)"
       printf "* Creating a test environment\n"
-      conda create --prefix "${env_dir}" -y python=3.9
+      conda create --prefix "${env_dir}" -y python=3.12
       printf "* Activating\n"
       conda activate "${env_dir}"

docs/requirements.txt

Lines changed: 1 addition & 1 deletion

@@ -16,7 +16,7 @@ sphinx_design
 torchvision
 dm_control
 mujoco<3.3.6
-gym[classic_control,accept-rom-license,ale-py,atari]
+gymnasium[classic_control,atari]
 pygame
 tqdm
 ipython
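
The swap above replaces the unmaintained gym package with gymnasium for the docs build. As a quick orientation, here is a minimal sketch of what the two extras provide; the environment names are illustrative, and the atari extra pulls in ale-py to register the ALE namespace:

import gymnasium as gym

# classic_control extra: CartPole, Pendulum, Acrobot, ...
env = gym.make("CartPole-v1")
obs, info = env.reset(seed=0)  # gymnasium's reset returns (obs, info)

# atari extra (environments registered via ale-py):
# env = gym.make("ALE/Pong-v5")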

docs/source/reference/config.rst

Lines changed: 1 addition & 1 deletion

@@ -507,7 +507,7 @@ Training and Optimization Configurations
     SparseAdamConfig

 Logging Configurations
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~

 .. currentmodule:: torchrl.trainers.algorithms.configs.logging

docs/source/reference/envs.rst

Lines changed: 0 additions & 1 deletion

@@ -1123,7 +1123,6 @@ to be able to create this other composition:
     ExcludeTransform
     FiniteTensorDictCheck
     FlattenObservation
-    FlattenTensorDict
     FrameSkipTransform
     GrayScale
     Hash

docs/source/reference/llms.rst

Lines changed: 3 additions & 3 deletions

@@ -118,9 +118,9 @@ Usage
 Adding Custom Templates
 ^^^^^^^^^^^^^^^^^^^^^^^

-You can add custom chat templates for new model families using the :func:`torchrl.data.llm.chat.add_chat_template` function.
+You can add custom chat templates for new model families using the :func:`torchrl.data.llm.add_chat_template` function.

-.. autofunction:: torchrl.data.llm.chat.add_chat_template
+.. autofunction:: torchrl.data.llm.add_chat_template

 Usage Examples
 ^^^^^^^^^^^^^^
@@ -130,7 +130,7 @@ Adding a Llama Template

 .. code-block:: python

-    >>> from torchrl.data.llm.chat import add_chat_template, History
+    >>> from torchrl.data.llm import add_chat_template, History
     >>> from transformers import AutoTokenizer
     >>>
     >>> # Define the Llama chat template
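
The only substantive change here is the import path: add_chat_template and History moved up from torchrl.data.llm.chat to the torchrl.data.llm package level. A minimal sketch of the updated import (nothing beyond what the diff itself shows is assumed):

# Updated public import surface after this commit
from torchrl.data.llm import History, add_chat_template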

docs/source/reference/utils.rst

Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 .. currentmodule:: torchrl

 torchrl._utils package
-====================
+======================

 Set of utility methods that are used internally by the library.

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -149,4 +149,4 @@ first_party_detection = false
 [project.entry-points."vllm.general_plugins"]
 # Ensure FP32 overrides are registered in all vLLM processes (main, workers, and
 # the registry subprocess) before resolving model classes.
-fp32_overrides = "torchrl.modules.llm.backends.vllm_plugin:register_fp32_overrides"
+fp32_overrides = "torchrl.modules.llm.backends.vllm.vllm_plugin:register_fp32_overrides"

test/llm/test_vllm.py

Lines changed: 2 additions & 2 deletions

@@ -40,7 +40,7 @@ class TestAsyncVLLMIntegration:
     @pytest.mark.slow
     def test_vllm_api_compatibility(self, sampling_params):
         """Test that AsyncVLLM supports the same inputs as vLLM.LLM.generate()."""
-        from torchrl.modules.llm.backends.vllm_async import AsyncVLLM
+        from torchrl.modules.llm.backends import AsyncVLLM

         # Create AsyncVLLM service
         service = AsyncVLLM.from_pretrained(
@@ -113,7 +113,7 @@ def test_vllm_api_compatibility(self, sampling_params):
     def test_weight_updates_with_transformer(self, sampling_params):
         """Test weight updates using vLLMUpdater with a real transformer model."""
         from torchrl.collectors.llm.weight_update.vllm import vLLMUpdater
-        from torchrl.modules.llm.backends.vllm_async import AsyncVLLM
+        from torchrl.modules.llm.backends import AsyncVLLM
         from torchrl.modules.llm.policies.transformers_wrapper import (
             TransformersWrapper,
         )

test/llm/test_wrapper.py

Lines changed: 1 addition & 1 deletion

@@ -18,7 +18,7 @@
 from tensordict.utils import _zip_strict
 from torchrl.data.llm import History
 from torchrl.envs.llm.transforms.kl import KLComputation, RetrieveKL, RetrieveLogProb
-from torchrl.modules.llm.backends.vllm_async import AsyncVLLM
+from torchrl.modules.llm import AsyncVLLM
 from torchrl.modules.llm.policies.common import (
     _batching,
     ChatHistory,
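
Taken together with the test_vllm.py change above, these edits show AsyncVLLM being promoted from the private vllm_async module to two public namespaces. A minimal sketch, assuming both re-exports exist after this commit as the tests imply:

from torchrl.modules.llm import AsyncVLLM as AsyncVLLMTop
from torchrl.modules.llm.backends import AsyncVLLM

# Same class, two public entry points; the old
# torchrl.modules.llm.backends.vllm_async path is retired.
assert AsyncVLLM is AsyncVLLMTop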

torchrl/collectors/collectors.py

Lines changed: 11 additions & 10 deletions

@@ -282,12 +282,13 @@ def async_shutdown(
     ) -> None:
         """Shuts down the collector when started asynchronously with the `start` method.

-        Arg:
+        Args:
             timeout (float, optional): The maximum time to wait for the collector to shutdown.
             close_env (bool, optional): If True, the collector will close the contained environment.
                 Defaults to `True`.

         .. seealso:: :meth:`~.start`
+
         """
         return self.shutdown(timeout=timeout, close_env=close_env)
@@ -595,7 +596,7 @@ class SyncDataCollector(DataCollectorBase):
           - In all other cases an attempt to wrap it will be undergone as such: ``TensorDictModule(policy, in_keys=env_obs_key, out_keys=env.action_keys)``.

         .. note:: If the policy needs to be passed as a policy factory (e.g., in case it mustn't be serialized /
-          pickled directly), the :arg:`policy_factory` should be used instead.
+          pickled directly), the ``policy_factory`` should be used instead.

     Keyword Args:
         policy_factory (Callable[[], Callable], optional): a callable that returns
@@ -2082,7 +2083,7 @@ class _MultiDataCollector(DataCollectorBase):
           ``TensorDictModule(policy, in_keys=env_obs_key, out_keys=env.action_keys)``.

         .. note:: If the policy needs to be passed as a policy factory (e.g., in case it mustn't be serialized /
-          pickled directly), the :arg:`policy_factory` should be used instead.
+          pickled directly), the ``policy_factory`` should be used instead.

     Keyword Args:
         policy_factory (Callable[[], Callable], list of Callable[[], Callable], optional): a callable
@@ -3278,8 +3279,8 @@ class MultiSyncDataCollector(_MultiDataCollector):
         ...     if i == 2:
         ...         print(data)
         ...         break
-        >>> collector.shutdown()
-        >>> del collector
+        ... collector.shutdown()
+        ... del collector
         TensorDict(
             fields={
                 action: Tensor(shape=torch.Size([200, 1]), device=cpu, dtype=torch.float32, is_shared=False),
@@ -3665,8 +3666,8 @@ class MultiaSyncDataCollector(_MultiDataCollector):
         ...     if i == 2:
         ...         print(data)
         ...         break
-        ...     collector.shutdown()
-        ...     del collector
+        ... collector.shutdown()
+        ... del collector
         TensorDict(
             fields={
                 action: Tensor(shape=torch.Size([200, 1]), device=cpu, dtype=torch.float32, is_shared=False),
@@ -3901,7 +3902,7 @@ class aSyncDataCollector(MultiaSyncDataCollector):
           - In all other cases an attempt to wrap it will be undergone as such: ``TensorDictModule(policy, in_keys=env_obs_key, out_keys=env.action_keys)``.

         .. note:: If the policy needs to be passed as a policy factory (e.g., in case it mustn't be serialized /
-          pickled directly), the :arg:`policy_factory` should be used instead.
+          pickled directly), the ``policy_factory`` should be used instead.

     Keyword Args:
         policy_factory (Callable[[], Callable], optional): a callable that returns
@@ -3915,8 +3916,8 @@ class aSyncDataCollector(MultiaSyncDataCollector):
             total number of frames returned by the collector
             during its lifespan. If the ``total_frames`` is not divisible by
             ``frames_per_batch``, an exception is raised.
-        Endless collectors can be created by passing ``total_frames=-1``.
-        Defaults to ``-1`` (never ending collector).
+            Endless collectors can be created by passing ``total_frames=-1``.
+            Defaults to ``-1`` (never ending collector).
         device (int, str or torch.device, optional): The generic device of the
             collector. The ``device`` args fills any non-specified device: if
             ``device`` is not ``None`` and any of ``storing_device``, ``policy_device`` or
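
Since several of these docstring fixes revolve around ``policy_factory``, here is a minimal sketch of the pattern they describe: letting the collector build the policy itself so it never has to be pickled. The environment and policy choices are illustrative assumptions, not part of this commit:

from torchrl.collectors import SyncDataCollector
from torchrl.envs import GymEnv
from torchrl.envs.utils import RandomPolicy

def make_policy():
    # Called by the collector, so the policy object itself
    # never needs to be serialized / pickled.
    return RandomPolicy(GymEnv("CartPole-v1").action_spec)

collector = SyncDataCollector(
    lambda: GymEnv("CartPole-v1"),
    policy=None,                 # no directly-passed policy...
    policy_factory=make_policy,  # ...the factory builds it instead
    frames_per_batch=64,
    total_frames=-1,             # endless collector, per the docstring
)
for data in collector:
    break  # one batch, then stop
collector.shutdown()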
