diff --git a/figures/ch6/ch6-world-model-flow-diagram.png b/figures/ch6/ch6-world-model-flow-diagram.png new file mode 100644 index 0000000..e881615 Binary files /dev/null and b/figures/ch6/ch6-world-model-flow-diagram.png differ diff --git a/main.bib b/main.bib index c1ae160..f355b66 100644 --- a/main.bib +++ b/main.bib @@ -2057,3 +2057,73 @@ @inproceedings{zhu2024minigpt author = {Zhu, Deyao and Chen, Jun and Shen, Xiaoqian and Li, Xiang and Elhoseiny, Mohamed}, year = {2024} } + +@article{https://doi.org/10.5281/zenodo.1207631, + doi = {10.5281/ZENODO.1207631}, + url = {https://zenodo.org/record/1207631}, + author = {Ha, David and Schmidhuber, Jürgen}, + title = {World Models}, + publisher = {Zenodo}, + year = {2018}, + copyright = {Creative Commons Attribution 4.0} +} + +@misc{hafner2020dreamcontrollearningbehaviors, + title={Dream to Control: Learning Behaviors by Latent Imagination}, + author={Danijar Hafner and Timothy Lillicrap and Jimmy Ba and Mohammad Norouzi}, + year={2020}, + eprint={1912.01603}, + archivePrefix={arXiv}, + primaryClass={cs.LG}, + url={https://arxiv.org/abs/1912.01603}, +} + +@misc{hafner2024masteringdiversedomainsworld, + title={Mastering Diverse Domains through World Models}, + author={Danijar Hafner and Jurgis Pasukonis and Jimmy Ba and Timothy Lillicrap}, + year={2024}, + eprint={2301.04104}, + archivePrefix={arXiv}, + primaryClass={cs.AI}, + url={https://arxiv.org/abs/2301.04104}, +} + +@misc{barthmaron2018distributeddistributionaldeterministicpolicy, + title={Distributed Distributional Deterministic Policy Gradients}, + author={Gabriel Barth-Maron and Matthew W. Hoffman and David Budden and Will Dabney and Dan Horgan and Dhruva TB and Alistair Muldal and Nicolas Heess and Timothy Lillicrap}, + year={2018}, + eprint={1804.08617}, + archivePrefix={arXiv}, + primaryClass={cs.LG}, + url={https://arxiv.org/abs/1804.08617}, +} + +@misc{wu2022daydreamerworldmodelsphysical, + title={DayDreamer: World Models for Physical Robot Learning}, + author={Philipp Wu and Alejandro Escontrela and Danijar Hafner and Ken Goldberg and Pieter Abbeel}, + year={2022}, + eprint={2206.14176}, + archivePrefix={arXiv}, + primaryClass={cs.RO}, + url={https://arxiv.org/abs/2206.14176}, +} + +@misc{nvidia2025cosmosworldfoundationmodel, + title={Cosmos World Foundation Model Platform for Physical AI}, + author={NVIDIA and : and Niket Agarwal and Arslan Ali and Maciej Bala and Yogesh Balaji and Erik Barker and Tiffany Cai and Prithvijit Chattopadhyay and Yongxin Chen and Yin Cui and Yifan Ding and Daniel Dworakowski and Jiaojiao Fan and Michele Fenzi and Francesco Ferroni and Sanja Fidler and Dieter Fox and Songwei Ge and Yunhao Ge and Jinwei Gu and Siddharth Gururani and Ethan He and Jiahui Huang and Jacob Huffman and Pooya Jannaty and Jingyi Jin and Seung Wook Kim and Gergely Klár and Grace Lam and Shiyi Lan and Laura Leal-Taixe and Anqi Li and Zhaoshuo Li and Chen-Hsuan Lin and Tsung-Yi Lin and Huan Ling and Ming-Yu Liu and Xian Liu and Alice Luo and Qianli Ma and Hanzi Mao and Kaichun Mo and Arsalan Mousavian and Seungjun Nah and Sriharsha Niverty and David Page and Despoina Paschalidou and Zeeshan Patel and Lindsey Pavao and Morteza Ramezanali and Fitsum Reda and Xiaowei Ren and Vasanth Rao Naik Sabavat and Ed Schmerling and Stella Shi and Bartosz Stefaniak and Shitao Tang and Lyne Tchapmi and Przemek Tredak and Wei-Cheng Tseng and Jibin Varghese and Hao Wang and Haoxiang Wang and Heng Wang and Ting-Chun Wang and Fangyin Wei and Xinyue Wei and Jay Zhangjie Wu and Jiashu Xu and Wei 
Yang and Lin Yen-Chen and Xiaohui Zeng and Yu Zeng and Jing Zhang and Qinsheng Zhang and Yuxuan Zhang and Qingqing Zhao and Artur Zolkowski}, + year={2025}, + eprint={2501.03575}, + archivePrefix={arXiv}, + primaryClass={cs.CV}, + url={https://arxiv.org/abs/2501.03575}, +} + +@misc{nvidia2025cosmosreason1physicalcommonsense, + title={Cosmos-Reason1: From Physical Common Sense To Embodied Reasoning}, + author={NVIDIA and : and Alisson Azzolini and Junjie Bai and Hannah Brandon and Jiaxin Cao and Prithvijit Chattopadhyay and Huayu Chen and Jinju Chu and Yin Cui and Jenna Diamond and Yifan Ding and Liang Feng and Francesco Ferroni and Rama Govindaraju and Jinwei Gu and Siddharth Gururani and Imad El Hanafi and Zekun Hao and Jacob Huffman and Jingyi Jin and Brendan Johnson and Rizwan Khan and George Kurian and Elena Lantz and Nayeon Lee and Zhaoshuo Li and Xuan Li and Maosheng Liao and Tsung-Yi Lin and Yen-Chen Lin and Ming-Yu Liu and Xiangyu Lu and Alice Luo and Andrew Mathau and Yun Ni and Lindsey Pavao and Wei Ping and David W. Romero and Misha Smelyanskiy and Shuran Song and Lyne Tchapmi and Andrew Z. Wang and Boxin Wang and Haoxiang Wang and Fangyin Wei and Jiashu Xu and Yao Xu and Dinghao Yang and Xiaodong Yang and Zhuolin Yang and Jingxu Zhang and Xiaohui Zeng and Zhe Zhang}, + year={2025}, + eprint={2503.15558}, + archivePrefix={arXiv}, + primaryClass={cs.AI}, + url={https://arxiv.org/abs/2503.15558}, +} diff --git a/main.tex b/main.tex index f00599d..5e4224f 100644 --- a/main.tex +++ b/main.tex @@ -51,6 +51,9 @@ \newpage \input{sections/05_foundation_models.tex} +\newpage +\input{sections/06_next_directions.tex} + \newpage \input{sections/07_conclusions.tex} diff --git a/sections/06_next_directions.tex b/sections/06_next_directions.tex index ec4e5df..f9f39e1 100644 --- a/sections/06_next_directions.tex +++ b/sections/06_next_directions.tex @@ -1,8 +1,167 @@ -- Post training VLAs - - From Imitation to Refinement - - EXPO - -- World Models for robotics - - Cosmos - - World Models (1X) - - Sima and Genie 1 +\section{Some Emerging Directions in Robot Learning} +\label{sec:emerging-directions} + +\epigraph{\textit{The future belongs to those who prepare for it today.}}{Malcolm X} + +\subsection{World Models for Robotics} + +The surge in effective generalist robot policies, as discussed in the previous sections, has been largely driven by the integration of large-scale, pre-trained backbones into robot learning frameworks. +However, beyond the dominant supervised behavioral cloning paradigm that characterizes vision-language-action models, an emerging and complementary direction in robot learning revolves around \emph{world models}: learned generative or predictive models that enable agents to imagine future scenarios and plan actions within a compact, learned latent space. +World models represent a paradigm shift in how agents can approach robot learning and control, offering a promising pathway to address long-standing challenges including \emph{sample efficiency}, \emph{generalization across tasks and embodiments}, and \emph{long-horizon planning}. +This subsection provides an overview of world models for robotics, discusses their role in model-based reinforcement learning, and introduces recent advances in their scale and applicability through \emph{foundation world models}. 
+
+\subsubsection{The Architecture of World Models}
+
+World models are typically composed of three interconnected neural network components~\citep{https://doi.org/10.5281/zenodo.1207631}, each addressing a distinct aspect of agent learning:
+
+The \textbf{Vision Model (V)} serves as the agent's perceptual system, compressing high-dimensional sensory inputs (typically RGB camera observations) into a low-dimensional, semantically meaningful \emph{latent representation}.
+A Variational Autoencoder (VAE)~\citep{kingma2013auto} is commonly used for this purpose, learning to map observations $x_t$ into latent vectors $z_t \in \mathbb{R}^{d_z}$, where $d_z$ is much smaller than the dimensionality of the raw images.
+The VAE simultaneously learns a decoder that reconstructs the original observation from the latent code, ensuring that the compressed representation retains the information needed for downstream control and prediction.
+This compression step is crucial: it lets the agent operate in a tractable latent space where predictions can be made efficiently, without explicit pixel-level reconstruction at every step.
+
+The \textbf{Memory Model (M)}, sometimes referred to as the \emph{dynamics model} or \emph{transition model}, learns to predict future latent states given the current latent state and the action taken by the agent.
+In early formulations~\citep{https://doi.org/10.5281/zenodo.1207631}, this was implemented as a Mixture Density Network RNN (MDN-RNN), which learns a stochastic dynamics model $p(z_{t+1} \mid z_t, a_t)$ representing the conditional distribution over future states.
+More recently, Recurrent State-Space Models (RSSMs) have become the de facto standard for this component~\citep{hafner2020dreamcontrollearningbehaviors,hafner2024masteringdiversedomainsworld}.
+An RSSM maintains both a \emph{deterministic recurrent state} $h_t$ and a \emph{stochastic latent state} $z_t$, enabling the model to capture both the predictable evolution and the inherent stochasticity of environment dynamics.
+The model is trained to minimize the reconstruction error of past observations and the prediction error of future rewards, jointly learning a rich representation of environment dynamics.
+
+The \textbf{Controller (C)} is the agent's decision-making component, mapping the current latent state to the action to execute.
+Early work on world models~\citep{https://doi.org/10.5281/zenodo.1207631} employed evolutionary algorithms such as CMA-ES (Covariance Matrix Adaptation Evolution Strategy) to optimize the controller parameters, treating the learned world model as a fixed environment simulator.
+Modern approaches instead employ differentiable controllers, typically neural networks parametrized as actor and critic functions, that are optimized via backpropagation through imagined trajectories generated by the learned world model.
+
+\begin{figure}
+    \centering
+    \includegraphics[width=0.6\textwidth]{figures/ch6/ch6-world-model-flow-diagram.png}
+    \caption{Flow diagram illustrating how the vision (V), memory (M), and controller (C) components interact with the environment.}
+    \label{fig:ch6-world-model-flow-diagram}
+\end{figure}
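+
+To make the role of the V component concrete, the snippet below gives a minimal PyTorch sketch of such a convolutional VAE; the layer sizes loosely follow the $64 \times 64$ RGB setup of~\citep{https://doi.org/10.5281/zenodo.1207631}, but every hyperparameter here is an illustrative choice rather than the reference implementation.
+
+\begin{verbatim}
+# Minimal sketch of the Vision model (V): a convolutional VAE compressing
+# 64x64 RGB observations (scaled to [0, 1]) into a latent vector z_t.
+# Layer sizes are illustrative, not the reference implementation.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class VisionVAE(nn.Module):
+    def __init__(self, z_dim=32):
+        super().__init__()
+        self.enc = nn.Sequential(                    # 3x64x64 -> 256x2x2
+            nn.Conv2d(3, 32, 4, 2), nn.ReLU(),
+            nn.Conv2d(32, 64, 4, 2), nn.ReLU(),
+            nn.Conv2d(64, 128, 4, 2), nn.ReLU(),
+            nn.Conv2d(128, 256, 4, 2), nn.ReLU(), nn.Flatten())
+        self.to_stats = nn.Linear(1024, 2 * z_dim)   # mean and log-variance
+        self.from_z = nn.Linear(z_dim, 1024)
+        self.dec = nn.Sequential(                    # 1024x1x1 -> 3x64x64
+            nn.ConvTranspose2d(1024, 128, 5, 2), nn.ReLU(),
+            nn.ConvTranspose2d(128, 64, 5, 2), nn.ReLU(),
+            nn.ConvTranspose2d(64, 32, 6, 2), nn.ReLU(),
+            nn.ConvTranspose2d(32, 3, 6, 2), nn.Sigmoid())
+
+    def forward(self, x):
+        mean, log_var = self.to_stats(self.enc(x)).chunk(2, -1)
+        z = mean + (0.5 * log_var).exp() * torch.randn_like(mean)  # reparameterization
+        recon = self.dec(self.from_z(z).view(-1, 1024, 1, 1))
+        kl = -0.5 * (1 + log_var - mean.pow(2) - log_var.exp()).sum(-1).mean()
+        rec = F.mse_loss(recon, x, reduction="none").sum((1, 2, 3)).mean()
+        return z, recon, rec + kl                    # latent, reconstruction, ELBO-style loss
+\end{verbatim}
+
+Training this model on the agent's observations yields the latent codes $z_t$ consumed by the memory model and controller: the reconstruction term keeps the codes informative, while the KL term keeps the latent space smooth enough for the dynamics model to predict in.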
+
+\subsubsection{Imagined Rollouts and Planning in Latent Space}
+
+The key insight enabling world models to tackle long-horizon control tasks is that, once a world model is learned, the agent can generate \emph{imagined trajectories} in the compact latent space without further interaction with the physical environment.
+
+This enables the agent to:
+(1) train behaviors by backpropagating through multiple imagined steps of the learned dynamics model, a process often called \emph{latent imagination}~\citep{hafner2020dreamcontrollearningbehaviors};
+(2) plan actions by simulating multiple candidate action sequences and selecting the one with the highest predicted return;
+and (3) leverage the learned latent dynamics to provide rich learning signals for training policies through model-based reinforcement learning.
+
+A seminal contribution in this direction is the \textbf{Dreamer} algorithm~\citep{hafner2020dreamcontrollearningbehaviors}, which learns behaviors purely from imagined rollouts in the latent space of a learned world model.
+Rather than learning the controller via evolutionary search, Dreamer introduces an actor-critic framework in which the actor (policy) and critic (value function) are trained by backpropagating through trajectories imagined by the world model.
+Critically, Dreamer demonstrates that this approach achieves comparable or superior data efficiency to model-free methods such as PPO~\citep{schulmanProximalPolicyOptimization2017} or D4PG~\citep{barthmaron2018distributeddistributionaldeterministicpolicy}, since policy optimization requires no additional environment interaction: the world model supports massively parallel imagination, with large batches of rollouts branching from latent states sampled from the replay buffer, so each real interaction is reused many times.
+
+\subsubsection{DreamerV3: Scaling World Models Across Diverse Domains}
+
+While early world model approaches showed promise on vision-based control tasks, achieving generality across diverse domains remained an open challenge.
+The third generation of the Dreamer algorithm, \textbf{DreamerV3}~\citep{hafner2024masteringdiversedomainsworld}, demonstrates that a \emph{single, fixed hyperparameter configuration} can outperform specialized expert algorithms across more than 150 diverse tasks, ranging from continuous control to game playing to robotic navigation.
+DreamerV3 achieves this generality through a combination of architectural innovations and careful objective design, including:
+
+\begin{enumerate}
+    \item \emph{Symlog predictions}, which squash the scale of observations, rewards, and values so that the same hyperparameters work across domains with widely different magnitudes.
+    \item \emph{KL balancing with free bits} between the dynamics and representation losses, ensuring that neither term dominates training and that the latent representation does not collapse.
+    \item \emph{Categorical distributions} for the stochastic latents and for reward and value prediction, replacing earlier Gaussian choices that could lead to high-variance gradients.
+    \item \emph{Architectural choices} such as layer normalization and gating mechanisms, which together improve training stability across diverse domains.
+\end{enumerate}
+
+A landmark result is that DreamerV3 was the first algorithm to collect diamonds in Minecraft (a long-horizon, sparse-reward environment requiring hierarchical reasoning) from scratch, without human demonstrations or curricula.
+This achievement underscores the capability of world models to solve complex, open-ended tasks by learning to imagine and plan over extended horizons.
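+
+To make this concrete, the following is a minimal, self-contained PyTorch sketch of the latent-imagination loop shared by the Dreamer family: a toy recurrent dynamics model is unrolled for a short horizon, and the actor is updated by backpropagating the imagined return through the unrolled dynamics. It is a structural illustration only; hyperparameters are arbitrary, and the critic, $\lambda$-returns, and the world-model training step are omitted for brevity.
+
+\begin{verbatim}
+# Sketch of actor learning by latent imagination (illustrative, simplified).
+import torch
+import torch.nn as nn
+
+class ToyRSSM(nn.Module):
+    """Deterministic GRU state h plus stochastic latent z, with a reward head."""
+    def __init__(self, z_dim=32, h_dim=256, a_dim=6):
+        super().__init__()
+        self.rnn = nn.GRUCell(z_dim + a_dim, h_dim)
+        self.prior = nn.Linear(h_dim, 2 * z_dim)   # mean and log-std of p(z' | h)
+        self.reward = nn.Linear(h_dim + z_dim, 1)
+
+    def imagine_step(self, h, z, a):
+        h = self.rnn(torch.cat([z, a], -1), h)
+        mean, log_std = self.prior(h).chunk(2, -1)
+        z = mean + log_std.exp() * torch.randn_like(mean)  # reparameterized sample
+        return h, z, self.reward(torch.cat([h, z], -1))
+
+def imagined_return(wm, actor, h, z, horizon=15):
+    """Unroll the learned dynamics for `horizon` steps, fully differentiably."""
+    rewards = []
+    for _ in range(horizon):
+        a = actor(torch.cat([h, z], -1))
+        h, z, r = wm.imagine_step(h, z, a)
+        rewards.append(r)
+    return torch.stack(rewards).sum(0)   # simplified return (no discount or critic)
+
+z_dim, h_dim, a_dim, batch = 32, 256, 6, 64
+wm = ToyRSSM(z_dim, h_dim, a_dim)
+actor = nn.Sequential(nn.Linear(h_dim + z_dim, 256), nn.ELU(),
+                      nn.Linear(256, a_dim), nn.Tanh())
+opt = torch.optim.Adam(actor.parameters(), lr=3e-4)
+
+# In practice h and z are posterior states from replayed real trajectories;
+# zeros are used here only to keep the sketch self-contained.
+h, z = torch.zeros(batch, h_dim), torch.zeros(batch, z_dim)
+opt.zero_grad()
+(-imagined_return(wm, actor, h, z).mean()).backward()  # maximize imagined return
+opt.step()
+\end{verbatim}
+
+The key property is that the entire rollout is differentiable, so the actor receives gradient signal through many imagined steps of the dynamics model rather than only from executed environment transitions.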
+
+\subsubsection{World Models and Cross-Embodiment Generalization}
+
+Beyond sample efficiency, world models also offer a path toward cross-embodiment generalization when trained on diverse, multi-embodiment data.
+\cite{wu2022daydreamerworldmodelsphysical} applied the Dreamer framework directly to four different physical robots, spanning locomotion, manipulation, and navigation tasks, and showed that the same algorithm can learn these behaviors online, in the real world and without simulators, within a matter of hours of interaction per robot.
+More broadly, cross-embodiment generalization is expected to emerge from world models trained on sufficiently diverse data: by learning to predict dynamics in a latent space, the model can capture principles of physics and interaction that generalize across different morphologies, provided that the training data exposes the model to such variations.
+
+\subsubsection{Challenges and Limitations of Traditional World Models}
+
+Despite their promise, scaling world models to handle complex, real-world robotics scenarios has proven challenging.
+The primary limitations include:
+
+\begin{enumerate}
+    \item \textbf{Error accumulation}: Errors in the learned dynamics model compound over long prediction horizons. While techniques like latent overshooting help mitigate this, long-horizon planning (e.g., predicting thousands of steps) remains difficult.
+    \item \textbf{Diversity of interactions}: Real-world environments exhibit high stochasticity and multimodality (e.g., a robot pushing an object might succeed or fail depending on subtle conditions), and capturing this diversity in a learned model is challenging.
+    \item \textbf{Partial observability}: Many real-world scenarios involve hidden state, requiring the world model to maintain uncertainty estimates or reason about unobserved variables, a problem that remains largely open.
+\end{enumerate}
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\subsection{Cosmos World Model}
+
+Recent advances in scaling and pre-training have given rise to a new paradigm: \emph{foundation world models}, large-scale generative models trained on diverse, internet-scale video datasets that can be fine-tuned to specific robotics tasks.
+One of the most prominent examples of this paradigm is \textbf{Cosmos}~\citep{nvidia2025cosmosworldfoundationmodel}, a world foundation model platform developed by NVIDIA.
+Cosmos represents a significant step toward democratizing world model development by providing openly available, pre-trained models that can be customized for downstream applications without requiring researchers to train models from scratch.
+
+\subsubsection{Cosmos Architecture and Components}
+
+Cosmos is not a single model, but rather a \emph{platform} consisting of multiple complementary components, each designed to support different aspects of physical AI development.
+The platform centers on two primary families of world foundation models:
+
+\textbf{Cosmos-Predict}~\citep{nvidia2025cosmosworldfoundationmodel} is a \emph{generative video model} designed to predict future video frames given multimodal inputs.
+Unlike traditional world models, which operate in a learned latent space, Cosmos-Predict generates \emph{direct video predictions}, creating photorealistic synthetic video frames that depict how a scene will evolve.
+Cosmos-Predict comes in two architectural variants:
+
+\begin{itemize}
+    \item \textbf{Diffusion-based models}: These models start from Gaussian noise and iteratively remove it, gradually refining the prediction through a sequence of denoising steps. The diffusion-based approach provides superior visual quality and physical plausibility, making it particularly suitable for applications requiring high-fidelity synthetic data generation.
+    \item \textbf{Autoregressive models}: These models predict video frames sequentially, generating one token at a time conditioned on past generations. The autoregressive approach enables real-time, low-latency generation at inference time, making it amenable to deployed robotic systems requiring foresight over prediction horizons of tens to hundreds of frames.
+\end{itemize}
+
+Both model families are trained on approximately \textbf{9 trillion tokens} extracted from \textbf{20 million hours} of real-world video data, including driving videos, robotics demonstrations, human activities, and nature footage.
+This massive scale of pre-training on diverse video data enables the models to learn generalizable representations of physics and world dynamics that transfer effectively to downstream tasks.
+Models are available in multiple capacity tiers (\emph{Nano} for edge deployment, \emph{Super} for high-performance baseline applications, and \emph{Ultra} for maximum quality), allowing developers to trade off model size, inference latency, and output quality based on their application requirements.
+
+\textbf{Cosmos-Reason}~\citep{nvidia2025cosmosworldfoundationmodel,nvidia2025cosmosreason1physicalcommonsense} is a specialized \emph{vision-language model} designed specifically for physical AI and robotics.
+Unlike Cosmos-Predict, which focuses on visual prediction, Cosmos-Reason is trained to perform \emph{spatio-temporal reasoning} over video and images, enabling it to understand and predict the \emph{outcomes} of physical interactions without explicitly generating pixel-perfect predictions.
+Cosmos-Reason is a 7-billion-parameter model that uses chain-of-thought reasoning to:
+(1) understand how physical objects interact based on physics principles;
+(2) predict the consequences of actions (e.g., ``if I push this object, it will fall'');
+(3) serve as a \emph{planning model} that reasons about multi-step action sequences to achieve goals;
+and (4) provide natural-language explanations of predicted outcomes.
+
+Cosmos-Reason is post-trained on \emph{physical common sense} and \emph{embodied reasoning} data through supervised fine-tuning and reinforcement learning, enabling it to reason about scenarios that deviate from the training distribution and to handle the long tail of diverse real-world situations.
+This makes Cosmos-Reason particularly valuable as a high-level planning module that can be paired with lower-level control policies to reason about task decomposition and action planning.
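+
+To illustrate this division of labor, the sketch below shows one way a reasoning model of this kind could sit above a low-level controller. This is a hypothetical integration written for this survey: none of the function names correspond to an actual Cosmos interface, and a real deployment would additionally need perception latency handling, subtask timeouts, and safety monitoring.
+
+\begin{verbatim}
+# Hypothetical pairing of a vision-language reasoner (high-level planner)
+# with a low-level control policy. All signatures are assumptions made for
+# illustration; they do not correspond to an official Cosmos API.
+from typing import Callable, List
+
+def plan_and_execute(
+    decompose: Callable[[List[bytes], str], List[str]],  # reasoner: frames + task -> subtasks
+    verify: Callable[[List[bytes], str], bool],          # reasoner: frames + task -> success?
+    act: Callable[[bytes, str], None],                   # low-level policy: frame + instruction
+    frames: Callable[[], List[bytes]],                   # camera interface
+    task: str,
+    max_replans: int = 3,
+) -> bool:
+    """Decompose the task, execute each subtask, verify the outcome, replan on failure."""
+    for _ in range(max_replans):
+        for instruction in decompose(frames(), task):
+            act(frames()[-1], instruction)               # continuous control is delegated
+        if verify(frames(), task):                       # reasoner judges the outcome
+            return True
+    return False
+\end{verbatim}
+
+The value of a model like Cosmos-Reason in such a loop lies in the decomposition and verification steps, where physical common sense determines whether a proposed subtask is feasible and whether its observed outcome actually achieved the goal.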
+
+\subsubsection{Data Curation and Pre-training at Scale}
+
+A cornerstone of Cosmos is the extensive \emph{video data curation pipeline} developed to extract high-quality, physics-rich training data from the massive raw video dataset.
+The pipeline~\citep{nvidia2025cosmosworldfoundationmodel} consists of several stages:
+
+\begin{itemize}
+    \item \textbf{Shot detection and splitting}: Videos are automatically split at scene transitions to ensure visually consistent clips, removing abrupt edits that would confuse the model's understanding of physics.
+    \item \textbf{Motion filtering}: Videos are classified by camera motion type (pan, zoom, static, etc.) and tagged accordingly. Mostly static clips and clips with erratic camera motion are filtered out, as they provide limited learning signal for world dynamics.
+    \item \textbf{Visual quality filtering}: Videos with compression artifacts, blur, low resolution, or other quality issues are removed using learned quality assessment models, ensuring that the training data consists of high-fidelity observations.
+    \item \textbf{Semantic deduplication}: Duplicate or near-duplicate videos are identified using learned visual embeddings and removed, creating a diverse training set and reducing training time.
+    \item \textbf{Video type filtering and balancing}: Videos are classified into categories (driving, robotics, human motion, nature, etc.) and the dataset is balanced to ensure diverse coverage of physical phenomena and interaction types.
+    \item \textbf{Annotation}: Each video clip is automatically annotated with a natural language caption using a large vision-language model, providing text conditioning for text-to-video generation.
+\end{itemize}
+
+This pipeline processes approximately 20 million hours of raw video and outputs roughly \textbf{100 million clips} suitable for training world foundation models.
+The accompanying infrastructure runs on thousands of NVIDIA GPUs and is optimized for throughput: the platform can process 20 million hours of video in about 40 days on NVIDIA Hopper GPUs (or 14 days on newer Blackwell GPUs), compared to over three years on conventional CPU systems.
+
+\subsubsection{Evaluation and Limitations of Foundation World Models}
+
+Evaluating world foundation models is challenging because, unlike supervised learning tasks, there is no single ground-truth metric of success.
+Rather, evaluation must assess whether the model captures the underlying physics and dynamics accurately enough to support downstream applications.
+Cosmos evaluation protocols~\citep{nvidia2025cosmosworldfoundationmodel} address this through:
+
+\begin{itemize}
+    \item \textbf{3D consistency}: Evaluates whether predicted videos maintain geometric and spatial consistency across viewpoints. Metrics include the Sampson error (a measure of geometric consistency between views) and the accuracy of camera pose estimation on predicted frames.
+    \item \textbf{Physics alignment}: Tests whether Cosmos predictions respect fundamental physical laws. Controlled experiments (e.g., dropping objects under gravity) assess whether the model accurately predicts object trajectories, collisions, and inertial properties.
+\end{itemize}
+
+However, current world foundation models exhibit important limitations:
+
+\begin{itemize}
+    \item \textbf{Object impermanence}: Models sometimes generate frames in which objects vanish or reappear unexpectedly, indicating incomplete reasoning about object permanence.
+    \item \textbf{Implausible dynamics}: Occasionally, generated frames violate physical laws (e.g., objects floating without support), suggesting that even large-scale pre-training does not guarantee a perfect understanding of physics.
+    \item \textbf{Error accumulation}: Errors compound over long rollouts, limiting the horizon over which predictions remain reliable enough for planning.
+    \item \textbf{Computational latency}: While autoregressive Cosmos models enable real-time generation, diffusion-based models require multiple denoising steps, introducing latency that may limit their applicability to real-time robotic control.
+\end{itemize}
+
+
+% TODO:
+% - Post training VLAs
+%   - From Imitation to Refinement
+%   - EXPO
+
+% - World Models for robotics
+%   - Cosmos
+%   - World Models (1X)
+%   - Sima and Genie 1