2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

11 changes: 11 additions & 0 deletions lib/bindings/python/Cargo.lock

Some generated files are not rendered by default.

5 changes: 4 additions & 1 deletion lib/llm/Cargo.toml
@@ -24,6 +24,7 @@ testing-etcd = []
block-manager = ["dep:nixl-sys", "dep:cudarc", "dep:nix", "dep:aligned-vec"]
cuda = ["dep:cudarc"]
integration = ["dynamo-runtime/integration"]
media-nixl = ["dep:nixl-sys", "dep:dynamo-memory"]

[[bench]]
name = "tokenizer"
@@ -33,9 +34,11 @@ harness = false
name = "transfer_context_v2"
harness = false
required-features = ["block-manager", "testing-cuda"]

[dependencies]
# repo
dynamo-runtime = { workspace = true }
dynamo-memory = { path = "../memory", version = "0.7.0", optional = true }

# workspace
aho-corasick = "1.1"
@@ -145,7 +148,7 @@ json-five = { version = "0.3" }
# media loading in the preprocessor
reqwest = { workspace = true }
base64 = { version = "0.22" }
image = { version = "0.25" }
image = { version = "0.25", features = ["serde"] }
tokio-rayon = {version = "2" }
ndarray = { version = "0.16" }

43 changes: 32 additions & 11 deletions lib/llm/src/preprocessor.rs
@@ -27,7 +27,8 @@ use std::{collections::HashMap, pin::Pin, sync::Arc};
use tracing;

use crate::model_card::{ModelDeploymentCard, ModelInfo};
use crate::preprocessor::media::MediaLoader;
#[cfg(feature = "media-nixl")]
use crate::preprocessor::media::{MediaDecoder, MediaFetcher, MediaLoader};
use crate::preprocessor::prompt::OAIChatLikeRequest;
use crate::protocols::common::preprocessor::{
MultimodalData, MultimodalDataMap, PreprocessedRequestBuilder,
@@ -114,6 +115,7 @@ pub struct OpenAIPreprocessor {
/// Per-model runtime configuration propagated to response generator (e.g., reasoning/tool parser)
runtime_config: crate::local_model::runtime_config::ModelRuntimeConfig,
tool_call_parser: Option<String>,
#[cfg(feature = "media-nixl")]
media_loader: Option<MediaLoader>,
}

@@ -143,14 +145,21 @@ impl OpenAIPreprocessor {

// Initialize runtime config from the ModelDeploymentCard
let runtime_config = mdc.runtime_config.clone();
let media_loader = None; // TODO: enable with decoder config from MDC

#[cfg(feature = "media-nixl")]
let media_loader = match mdc.media_decoder {
Some(media_decoder) => Some(MediaLoader::new(media_decoder, mdc.media_fetcher)?),
None => None,
};

Ok(Arc::new(Self {
formatter,
tokenizer,
model_info,
mdcsum,
runtime_config,
tool_call_parser,
#[cfg(feature = "media-nixl")]
media_loader,
}))
}
@@ -279,7 +288,9 @@ impl OpenAIPreprocessor {
let messages = request.messages();
let message_count = messages.len().unwrap_or(0);
let mut media_map: MultimodalDataMap = HashMap::new();
let mut fetch_tasks = Vec::new();
#[cfg(feature = "media-nixl")]
let mut fetch_tasks: Vec<(String, ChatCompletionRequestUserMessageContentPart)> =
Vec::new();

for idx in 0..message_count {
let msg = messages
@@ -312,29 +323,39 @@ impl OpenAIPreprocessor {
_ => continue,
};

#[cfg(feature = "media-nixl")]
if self.media_loader.is_some() {
fetch_tasks.push((type_str, content_part.clone()));
} else {
// No loader, just pass the URL through
media_map
.entry(type_str)
.or_default()
.push(MultimodalData::Url(url));
continue;
}

// Fallback: just pass the URL through
media_map
.entry(type_str)
.or_default()
.push(MultimodalData::Url(url));
}
}

// Execute all fetch tasks
#[cfg(feature = "media-nixl")]
if !fetch_tasks.is_empty() {
let loader = self.media_loader.as_ref().unwrap();
let _results = futures::future::join_all(
let results = futures::future::join_all(
fetch_tasks
.iter()
.map(|(_, content_part)| loader.fetch_and_decode_media_part(content_part)),
)
.await;

// TODO: decode and pass NIXL descriptors to the media map
for ((type_str, _), result) in fetch_tasks.into_iter().zip(results.into_iter()) {
// if one item fails, the whole request errors; other items will be cleaned up by Drop
let rdma_descriptor = result?;
media_map
.entry(type_str)
.or_default()
.push(MultimodalData::Decoded(rdma_descriptor));
}
}

if !media_map.is_empty() {
5 changes: 5 additions & 0 deletions lib/llm/src/preprocessor/media.rs
@@ -4,7 +4,12 @@
mod common;
mod decoders;
mod loader;
mod rdma;

pub use common::EncodedMediaData;
pub use decoders::{Decoder, ImageDecoder, MediaDecoder};
pub use loader::{MediaFetcher, MediaLoader};

pub use rdma::{DecodedMediaData, RdmaMediaDataDescriptor};
#[cfg(feature = "media-nixl")]
pub use rdma::{get_nixl_agent, get_nixl_metadata};
63 changes: 63 additions & 0 deletions lib/llm/src/preprocessor/media/README.md
@@ -0,0 +1,63 @@
# Media decoding in the frontend


This component performs media download, base64 decoding, media decoding, and NIXL registration. Today it is used in the OpenAI preprocessor to transform multimodal inputs (image_url, video_url, audio_url) into fully decoded data (pixel values, ...) accessible to the backends via NIXL.

## Usage

Media decoding is enabled when registering the MDC (ModelDeploymentCard):

Set HTTP download options:

```python
from dynamo.llm import MediaFetcher
fetcher = MediaFetcher()
fetcher.user_agent("dynamo")
fetcher.timeout_ms(15000)
fetcher.allow_direct_ip(True)
fetcher.allow_direct_port(False)
fetcher.allowed_media_domains(["google.com"])
```
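
As the field names suggest, these options appear to act as download guards: `timeout_ms` bounds each fetch, `allow_direct_ip`/`allow_direct_port` control whether URLs may target raw IP addresses or non-default ports, and `allowed_media_domains` restricts downloads to an allowlist of hostnames.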

Set media decoding options:

```python
from dynamo.llm import MediaDecoder
decoder = MediaDecoder()
decoder.image_decoder({"max_image_width": 4096, "max_image_height": 4096, "max_alloc": 16*1024*1024})
```

And register the LLM as usual, adding the media configuration:

```python
register_llm(
...,
media_decoder=decoder,
media_fetcher=fetcher,
)
```
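
Once registered, the decode path is exercised by ordinary OpenAI-style chat requests that carry media parts. A minimal sketch, assuming a hypothetical OpenAI-compatible frontend at `http://localhost:8000/v1` and a hypothetical model name `my-model`:

```python
from openai import OpenAI

# Hypothetical endpoint and model name; adjust to your deployment.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="unused")

response = client.chat.completions.create(
    model="my-model",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            # The frontend downloads and decodes this URL, then exposes the
            # pixel data to backends via NIXL. The host must pass the
            # allowed_media_domains allowlist configured above.
            {"type": "image_url", "image_url": {"url": "https://google.com/logo.png"}},
        ],
    }],
)
print(response.choices[0].message.content)
```

With the fetcher configured above, only `google.com` URLs would pass the domain allowlist; other hosts should be rejected before any download happens.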


## TODOs

### Modalities

- [x] Image decoding
- [ ] Video decoding
- [ ] Audio decoding

### Performance

- [x] Image SW decoding
- [ ] Video HW decoding (NVDEC)
- [ ] JPEG HW decoding (nvJPEG)
- [ ] Sparse video sampling (seek-forward)
- [ ] Memory slab pre-allocation/registration

### Memory management
- [ ] Memory spilling to lower storage tiers
- [ ] Early-free memory on client notifications

### Misc
- [ ] Observability of performance, memory usage, and input distributions
- [ ] Per-request decoding options
49 changes: 8 additions & 41 deletions lib/llm/src/preprocessor/media/decoders.rs
@@ -2,52 +2,14 @@
// SPDX-License-Identifier: Apache-2.0

use anyhow::Result;
use serde::{Deserialize, Serialize};

use super::common::EncodedMediaData;
use ndarray::{ArrayBase, Dimension, OwnedRepr};
mod image;
use super::rdma::DecodedMediaData;
pub mod image;

pub use image::{ImageDecoder, ImageMetadata};

#[derive(Debug)]
pub enum DecodedMediaMetadata {
#[allow(dead_code)] // used in followup MR
Image(ImageMetadata),
}

#[derive(Debug, PartialEq, Eq)]
pub enum DataType {
UINT8,
}

// Decoded media data (image RGB, video frames pixels, ...)
#[derive(Debug)]
pub struct DecodedMediaData {
#[allow(dead_code)] // used in followup MR
pub(crate) data: Vec<u8>,
#[allow(dead_code)] // used in followup MR
pub(crate) shape: Vec<usize>,
#[allow(dead_code)] // used in followup MR
pub(crate) dtype: DataType,
#[allow(dead_code)] // used in followup MR
pub(crate) metadata: Option<DecodedMediaMetadata>,
}

// convert Array{N}<u8> to DecodedMediaData
// TODO: Array1<f32> for audio
impl<D: Dimension> From<ArrayBase<OwnedRepr<u8>, D>> for DecodedMediaData {
fn from(array: ArrayBase<OwnedRepr<u8>, D>) -> Self {
let shape = array.shape().to_vec();
let (data, _) = array.into_raw_vec_and_offset();
Self {
data,
shape,
dtype: DataType::UINT8,
metadata: None,
}
}
}

#[async_trait::async_trait]
pub trait Decoder: Clone + Send + 'static {
fn decode(&self, data: EncodedMediaData) -> Result<DecodedMediaData>;
@@ -67,3 +29,8 @@ pub struct MediaDecoder {
pub image_decoder: ImageDecoder,
// TODO: video, audio decoders
}

#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
pub enum DecodedMediaMetadata {
Image(ImageMetadata),
}