diff --git a/ggml/llamacpp/llamacpp.go b/ggml/llamacpp/llamacpp.go
index 96ce0ab..ba0eb50 100644
--- a/ggml/llamacpp/llamacpp.go
+++ b/ggml/llamacpp/llamacpp.go
@@ -12,7 +12,7 @@ package llamacpp
 #cgo CXXFLAGS: -std=c++17 -I${SRCDIR}/third_party/include -I${SRCDIR}/third_party/ggml/include -I${SRCDIR}/third_party/common
 #cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/darwin-arm64
 #cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/darwin-amd64
-#cgo darwin LDFLAGS: -lcommon -lllama -lggml-cpu -lggml-base -lggml -lggml-blas -lggml-metal -L/usr/local/opt/libomp/lib -L/opt/homebrew/opt/libomp/lib -lomp -framework Accelerate -framework Metal -framework Foundation -lstdc++ -lm
+#cgo darwin LDFLAGS: -lmtmd -lcommon -lllama -lggml-cpu -lggml-base -lggml -lggml-blas -lggml-metal -L/usr/local/opt/libomp/lib -L/opt/homebrew/opt/libomp/lib -lomp -framework Accelerate -framework Metal -framework Foundation -lstdc++ -lm
 #include <stdlib.h>
 #include <stdbool.h>
 #include "wrapper.h"
@@ -317,6 +317,108 @@ func (c *Context) GetEmbeddings(text string) ([]float32, error) {
 	return out[:n], nil
 }
 
+// GenerateStreamWithImages runs multimodal generation over prompt and images,
+// streaming each generated token to cb. Return false from cb to stop early.
+//
+// mmprojPath is the path to the multimodal projector GGUF file. images holds the
+// raw encoded bytes (JPEG/PNG) of each image; the native tokenizer requires the
+// number of media markers in prompt to equal len(images).
+//
+// Image bytes are copied into C-allocated memory for the duration of the call,
+// because cgo forbids handing C a Go pointer to memory that itself contains Go
+// pointers (which a Go-allocated []C.go_llama_image would). An error is
+// returned if the context or model is closed, if an image is empty or too
+// large for the C int size field, or if the native call reports failure.
+func (c *Context) GenerateStreamWithImages(prompt string, images [][]byte, mmprojPath string, cb func(token string) bool, opts ...GenerateOption) error {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	if c.c == nil || c.model.c == nil {
+		return errors.New("context or model is closed")
+	}
+
+	// go_llama_image.size is a C int, and an empty image has no first byte to
+	// point at, so reject both before allocating anything on the C side.
+	const maxImageBytes = 1<<31 - 1
+	for i, img := range images {
+		if len(img) == 0 {
+			return fmt.Errorf("image %d is empty", i)
+		}
+		if len(img) > maxImageBytes {
+			return fmt.Errorf("image %d is too large (%d bytes)", i, len(img))
+		}
+	}
+
+	cfg := defaultGenerateConfig()
+	for _, o := range opts {
+		o(&cfg)
+	}
+
+	cprompt := C.CString(prompt)
+	defer C.free(unsafe.Pointer(cprompt))
+
+	cmmproj := C.CString(mmprojPath)
+	defer C.free(unsafe.Pointer(cmmproj))
+
+	params := C.go_llama_default_generate_params()
+	params.max_tokens = C.int(cfg.maxTokens)
+	params.temperature = C.float(cfg.temperature)
+	params.top_k = C.int(cfg.topK)
+	params.top_p = C.float(cfg.topP)
+	params.min_p = C.float(cfg.minP)
+	params.repeat_penalty = C.float(cfg.repeatPenalty)
+	params.freq_penalty = C.float(cfg.freqPenalty)
+	params.presence_penalty = C.float(cfg.presencePenalty)
+	params.seed = C.int(cfg.seed)
+	params.penalty_last_n = C.int(cfg.penaltyLastN)
+
+	// Build the go_llama_image array, and each image payload, in C memory.
+	var cImages *C.go_llama_image
+	if len(images) > 0 {
+		arr := C.malloc(C.size_t(len(images)) * C.size_t(unsafe.Sizeof(C.go_llama_image{})))
+		defer C.free(arr)
+		cImages = (*C.go_llama_image)(arr)
+		cImagesSlice := unsafe.Slice(cImages, len(images))
+		for i, img := range images {
+			data := C.CBytes(img)
+			// Deliberately deferred inside the loop: every copy must stay alive
+			// until the native call below has returned.
+			defer C.free(data)
+			cImagesSlice[i].data = (*C.uchar)(data)
+			cImagesSlice[i].size = C.int(len(img))
+		}
+	}
+
+	state := &streamState{cb: cb}
+	handle := registerCallback(state)
+	defer unregisterCallback(handle)
+
+	rc := C.go_llama_generate_with_images(
+		unsafe.Pointer(c.c),
+		unsafe.Pointer(c.model.c),
+		cmmproj,
+		cprompt,
+		cImages,
+		C.int(len(images)),
+		params,
+		C.go_llama_token_callback(C.goTokenCallbackBridge),
+		unsafe.Pointer(handle),
+	)
+
+	if rc != 0 {
+		errMsg := C.GoString(C.go_llama_last_error())
+		return fmt.Errorf("generate with images failed: %s", errMsg)
+	}
+	return nil
+}
+
+// FreeMTMD frees the cached multimodal context. Call on shutdown.
+// It takes no lock, so it must not run concurrently with
+// GenerateStreamWithImages on any Context.
+func FreeMTMD() {
+	C.go_llama_mtmd_free()
+}
+
 // Close frees the context resources.
 func (c *Context) Close() {
 	c.mu.Lock()
diff --git a/ggml/llamacpp/llamacpp_android.go b/ggml/llamacpp/llamacpp_android.go
index 1e2265f..f1b95b2 100644
--- a/ggml/llamacpp/llamacpp_android.go
+++ b/ggml/llamacpp/llamacpp_android.go
@@ -7,6 +7,6 @@ package llamacpp
 
 /*
 #cgo android,arm64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/android-arm64
-#cgo android LDFLAGS: -Wl,--start-group -lcommon -lllama -lggml-cpu -lggml-base -lggml -Wl,--end-group -lstdc++ -lm -ldl -llog
+#cgo android LDFLAGS: -Wl,--start-group -lmtmd -lcommon -lllama -lggml-cpu -lggml-base -lggml -Wl,--end-group -lstdc++ -lm -ldl -llog
 */
 import "C"
diff --git a/ggml/llamacpp/llamacpp_linux.go b/ggml/llamacpp/llamacpp_linux.go
index 0d79d74..a35551a 100644
--- a/ggml/llamacpp/llamacpp_linux.go
+++ b/ggml/llamacpp/llamacpp_linux.go
@@ -8,6 +8,6 @@ package llamacpp
 /*
 #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/linux-amd64
 #cgo linux,arm64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/linux-arm64
-#cgo linux LDFLAGS: -Wl,--start-group -lcommon -lllama -lggml-cpu -lggml-base -lggml -Wl,--end-group -lstdc++ -lm -lpthread -ldl -lrt -lgomp
+#cgo linux LDFLAGS: -Wl,--start-group -lmtmd -lcommon -lllama -lggml-cpu -lggml-base -lggml -Wl,--end-group -lstdc++ -lm -lpthread -ldl -lrt -lgomp
 */
 import "C"
diff --git a/ggml/llamacpp/llamacpp_linux_cuda.go b/ggml/llamacpp/llamacpp_linux_cuda.go
index 2e8578d..d7c8fd5 100644
--- a/ggml/llamacpp/llamacpp_linux_cuda.go
+++ b/ggml/llamacpp/llamacpp_linux_cuda.go
@@ -7,6 +7,6 @@ package llamacpp
 
 /*
 #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/linux-amd64-cuda
-#cgo linux LDFLAGS: -Wl,--start-group -lcommon -lllama -lggml-cpu -lggml-base -lggml -lggml-cuda -Wl,--end-group -lcuda -lcudart -lcublas -lcublasLt -lstdc++ -lm -lpthread -ldl -lrt -lgomp
+#cgo linux LDFLAGS: -Wl,--start-group -lmtmd -lcommon -lllama -lggml-cpu -lggml-base -lggml -lggml-cuda -Wl,--end-group -lcuda -lcudart -lcublas -lcublasLt -lstdc++ -lm -lpthread -ldl -lrt -lgomp
 */
 import "C"
diff --git a/ggml/llamacpp/llamacpp_linux_vulkan.go b/ggml/llamacpp/llamacpp_linux_vulkan.go
index b15c767..97fea4b 100644
--- a/ggml/llamacpp/llamacpp_linux_vulkan.go
+++ b/ggml/llamacpp/llamacpp_linux_vulkan.go
@@ -7,6 +7,6 @@ package llamacpp
 
 /*
 #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/linux-amd64-vulkan
-#cgo linux LDFLAGS: -Wl,--start-group -lcommon -lllama -lggml-cpu -lggml-base -lggml -lggml-vulkan -Wl,--end-group -lvulkan -lstdc++ -lm -lpthread -ldl -lrt -lgomp
+#cgo linux LDFLAGS: -Wl,--start-group -lmtmd -lcommon -lllama -lggml-cpu -lggml-base -lggml -lggml-vulkan -Wl,--end-group -lvulkan -lstdc++ -lm -lpthread -ldl -lrt -lgomp
 */
 import "C"
diff --git a/ggml/llamacpp/third_party/include/mtmd-helper.h b/ggml/llamacpp/third_party/include/mtmd-helper.h
new file mode 100644
index 0000000..5036b92
--- /dev/null
+++ b/ggml/llamacpp/third_party/include/mtmd-helper.h
@@ -0,0 +1,96 @@
+#ifndef MTMD_HELPER_H
+#define MTMD_HELPER_H
+
+#include "ggml.h"
+#include "llama.h"
+#include "mtmd.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//
+// libmtmd helper functions
+//
+// Please note that these helpers are not guaranteed to be stable.
+// BREAKING CHANGES are expected.
+//
+
+// Set callback for all future logging events.
+// If this is not called, or NULL is supplied, everything is output on stderr.
+// Note: this also calls mtmd_log_set() internally
+MTMD_API void mtmd_helper_log_set(ggml_log_callback log_callback, void * user_data);
+
+// helper function to construct a mtmd_bitmap from a file
+// it calls mtmd_helper_bitmap_init_from_buf() internally
+// returns nullptr on failure
+// this function is thread-safe
+MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname);
+
+// helper function to construct a mtmd_bitmap from a buffer containing a file
+// supported formats:
+//     image: formats supported by stb_image: jpg, png, bmp, gif, etc.
+//     audio: formats supported by miniaudio: wav, mp3, flac
+// note: audio files will be auto-detected based on magic bytes
+// returns nullptr on failure
+// this function is thread-safe
+MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len);
+
+// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
+MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
+
+// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past
+// normally, n_pos is equal to n_tokens, but for M-RoPE it is different
+MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
+
+// helper function that automatically:
+// 1. run llama_decode() on text chunks
+// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
+// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
+// otherwise, returns 0 on success
+// this function is NOT thread-safe
+MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
+                                         struct llama_context * lctx,
+                                         const mtmd_input_chunks * chunks,
+                                         llama_pos n_past,
+                                         llama_seq_id seq_id,
+                                         int32_t n_batch,
+                                         bool logits_last,
+                                         llama_pos * new_n_past);
+
+// works like mtmd_helper_eval_chunks(), but only for a single chunk
+// this function is NOT thread-safe
+MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
+                                               struct llama_context * lctx,
+                                               const mtmd_input_chunk * chunk,
+                                               llama_pos n_past,
+                                               llama_seq_id seq_id,
+                                               int32_t n_batch,
+                                               bool logits_last,
+                                               llama_pos * new_n_past);
+
+// helper function to decode an image whose embeddings have already been calculated
+// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
+// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
+MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
+                                                struct llama_context * lctx,
+                                                const mtmd_input_chunk * chunk,
+                                                float * encoded_embd,
+                                                llama_pos n_past,
+                                                llama_seq_id seq_id,
+                                                int32_t n_batch,
+                                                llama_pos * new_n_past);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+//
+// C++ wrappers
+//
+
+#endif
diff --git a/ggml/llamacpp/third_party/include/mtmd.h b/ggml/llamacpp/third_party/include/mtmd.h
new file mode 100644
index 0000000..ebb4a18
--- /dev/null
+++ b/ggml/llamacpp/third_party/include/mtmd.h
@@ -0,0 +1,319 @@
+#ifndef MTMD_H
+#define MTMD_H
+
+#include "ggml.h"
+#include "llama.h"
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#ifdef __cplusplus
+#include <string>
+#include <vector>
+#include <cinttypes>
+#include <memory>
+#endif
+
+/**
+ * libmtmd: A library for multimodal support in llama.cpp.
+ *
+ * WARNING: This API is experimental and subject to many BREAKING CHANGES.
+ *          Issues related to API usage may receive lower priority support.
+ *
+ * For the usage, see an example in mtmd-cli.cpp
+ *
+ * For contributors:
+ * - Make sure the C API is aligned with the libllama C API (as in llama.h)
+ * - Do not include model name (e.g., qwen, gemma) in the API, use generic terms instead
+ * - Keep the API minimal, do not expose internal details unless necessary
+ *
+ * IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated.
+ * We encourage human contributors to ensure the quality and reliability of the codebase.
+ */
+
+#ifdef LLAMA_SHARED
+#    if defined(_WIN32) && !defined(__MINGW32__)
+#        ifdef LLAMA_BUILD
+#            define MTMD_API __declspec(dllexport)
+#        else
+#            define MTMD_API __declspec(dllimport)
+#        endif
+#    else
+#        define MTMD_API __attribute__ ((visibility ("default")))
+#    endif
+#else
+#    define MTMD_API
+#endif
+
+// deprecated marker, use mtmd_default_marker() instead
+#define MTMD_DEFAULT_IMAGE_MARKER "<__image__>"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+enum mtmd_input_chunk_type {
+    MTMD_INPUT_CHUNK_TYPE_TEXT,
+    MTMD_INPUT_CHUNK_TYPE_IMAGE,
+    MTMD_INPUT_CHUNK_TYPE_AUDIO,
+};
+
+// opaque types
+struct mtmd_context;
+struct mtmd_bitmap;
+struct mtmd_image_tokens;
+struct mtmd_input_chunk;
+struct mtmd_input_chunks;
+
+struct mtmd_input_text {
+    const char * text;
+    bool add_special;
+    bool parse_special;
+};
+
+//
+// C API
+//
+
+typedef struct mtmd_context      mtmd_context;
+typedef struct mtmd_bitmap       mtmd_bitmap;
+typedef struct mtmd_image_tokens mtmd_image_tokens;
+typedef struct mtmd_input_chunk  mtmd_input_chunk;
+typedef struct mtmd_input_chunks mtmd_input_chunks;
+typedef struct mtmd_input_text   mtmd_input_text;
+
+struct mtmd_context_params {
+    bool use_gpu;
+    bool print_timings;
+    int n_threads;
+    const char * image_marker; // deprecated, use media_marker instead
+    const char * media_marker;
+    enum llama_flash_attn_type flash_attn_type;
+    bool warmup; // whether to run a warmup encode pass after initialization
+
+    // limit number of image tokens, only for vision models with dynamic resolution
+    int image_min_tokens; // minimum number of tokens for image input (default: read from metadata)
+    int image_max_tokens; // maximum number of tokens for image input (default: read from metadata)
+
+    // callback function passed over to mtmd proper
+    ggml_backend_sched_eval_callback cb_eval;
+    void * cb_eval_user_data;
+};
+
+MTMD_API const char * mtmd_default_marker(void);
+
+MTMD_API struct mtmd_context_params mtmd_context_params_default(void);
+
+// initialize the mtmd context
+// return nullptr on failure
+MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
+                                            const struct llama_model * text_model,
+                                            const struct mtmd_context_params ctx_params);
+
+MTMD_API void mtmd_free(mtmd_context * ctx);
+
+// whether we need to set non-causal mask before llama_decode
+MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
+
+// whether the current model uses M-RoPE for llama_decode
+MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);
+
+// whether the current model supports vision input
+MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
+
+// whether the current model supports audio input
+MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
+
+// get audio sample rate in Hz, for example 16000 for Whisper
+// return -1 if audio is not supported
+MTMD_API int mtmd_get_audio_sample_rate(mtmd_context * ctx);
+
+// mtmd_bitmap
+//
+// if bitmap is image:
+//     length of data must be nx * ny * 3
+//     the data is in RGBRGBRGB... format
+// if bitmap is audio:
+//     length of data must be n_samples * sizeof(float)
+//     the data is in float format (PCM F32)
+MTMD_API mtmd_bitmap *         mtmd_bitmap_init           (uint32_t nx, uint32_t ny, const unsigned char * data);
+MTMD_API mtmd_bitmap *         mtmd_bitmap_init_from_audio(size_t n_samples,         const float         * data);
+MTMD_API uint32_t              mtmd_bitmap_get_nx     (const mtmd_bitmap * bitmap);
+MTMD_API uint32_t              mtmd_bitmap_get_ny     (const mtmd_bitmap * bitmap);
+MTMD_API const unsigned char * mtmd_bitmap_get_data   (const mtmd_bitmap * bitmap);
+MTMD_API size_t                mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap);
+MTMD_API bool                  mtmd_bitmap_is_audio   (const mtmd_bitmap * bitmap);
+MTMD_API void                  mtmd_bitmap_free       (mtmd_bitmap * bitmap);
+// bitmap ID is optional, but useful for KV cache tracking
+// these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data()
+MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap);
+MTMD_API void         mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id);
+
+
+// mtmd_input_chunks
+//
+// this is simply a list of mtmd_input_chunk
+// the elements can only be populated via mtmd_tokenize()
+MTMD_API mtmd_input_chunks *      mtmd_input_chunks_init(void);
+MTMD_API size_t                   mtmd_input_chunks_size(const mtmd_input_chunks * chunks);
+MTMD_API const mtmd_input_chunk * mtmd_input_chunks_get (const mtmd_input_chunks * chunks, size_t idx);
+MTMD_API void                     mtmd_input_chunks_free(mtmd_input_chunks * chunks);
+
+// mtmd_input_chunk
+//
+// the instance will be constructed via mtmd_tokenize()
+// it will be freed along with mtmd_input_chunks
+MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type        (const mtmd_input_chunk * chunk);
+MTMD_API const llama_token *        mtmd_input_chunk_get_tokens_text (const mtmd_input_chunk * chunk, size_t * n_tokens_output);
+MTMD_API const mtmd_image_tokens *  mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk);
+MTMD_API size_t                     mtmd_input_chunk_get_n_tokens    (const mtmd_input_chunk * chunk);
+// returns nullptr for ID on text chunk
+MTMD_API const char *               mtmd_input_chunk_get_id          (const mtmd_input_chunk * chunk);
+// number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise)
+MTMD_API llama_pos                  mtmd_input_chunk_get_n_pos       (const mtmd_input_chunk * chunk);
+
+// in case you want to use custom logic to handle the chunk (i.e. KV cache management)
+// you can move the chunk ownership to your own code by copying it
+// remember to free the chunk when you are done with it
+MTMD_API mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk);
+MTMD_API void               mtmd_input_chunk_free(mtmd_input_chunk * chunk);
+
+
+// mtmd_image_tokens
+//
+// the instance will be constructed via mtmd_tokenize()
+// it will be freed along with mtmd_input_chunk
+MTMD_API size_t       mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); // TODO: deprecate
+MTMD_API size_t       mtmd_image_tokens_get_nx      (const mtmd_image_tokens * image_tokens);
+MTMD_API size_t       mtmd_image_tokens_get_ny      (const mtmd_image_tokens * image_tokens);
+MTMD_API const char * mtmd_image_tokens_get_id      (const mtmd_image_tokens * image_tokens); // TODO: deprecate
+// number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise)
+MTMD_API llama_pos    mtmd_image_tokens_get_n_pos   (const mtmd_image_tokens * image_tokens); // TODO: deprecate
+
+// tokenize an input text prompt and a list of bitmaps (images/audio)
+// the prompt must have the input image marker (default: "<__media__>") in it
+// the default marker is defined by mtmd_default_marker()
+// the marker will be replaced with the image/audio chunk
+// for example:
+//   "here is an image: <__media__>\ndescribe it in detail."
+//   this will give 3 chunks:
+//   1. "here is an image: <start_of_image>"
+//   2. (image/audio tokens)
+//   3. "<end_of_image>\ndescribe it in detail."
+// number of bitmaps must be equal to the number of markers in the prompt
+// this function is thread-safe (shared ctx)
+// return values:
+//   0 on success
+//   1 on number of bitmaps not matching the number of markers
+//   2 on image preprocessing error
+MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx,
+                               mtmd_input_chunks * output,
+                               const mtmd_input_text * text,
+                               const mtmd_bitmap ** bitmaps,
+                               size_t n_bitmaps);
+
+// returns 0 on success
+// TODO: deprecate
+MTMD_API int32_t mtmd_encode(mtmd_context * ctx,
+                             const mtmd_image_tokens * image_tokens);
+
+// returns 0 on success
+MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx,
+                                   const mtmd_input_chunk * chunk);
+
+// get output embeddings from the last encode pass
+// the reading size (in bytes) is equal to:
+// llama_model_n_embd_inp(model) * mtmd_input_chunk_get_n_tokens(chunk) * sizeof(float)
+MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
+
+// Set callback for all future logging events.
+// If this is not called, or NULL is supplied, everything is output on stderr.
+MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data);
+
+/////////////////////////////////////////
+
+// test function, to be used in test-mtmd-c-api.c
+MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+//
+// C++ wrappers
+//
+
+#ifdef __cplusplus
+
+namespace mtmd {
+
+struct mtmd_context_deleter {
+    void operator()(mtmd_context * val) { mtmd_free(val); }
+};
+using context_ptr = std::unique_ptr<mtmd_context, mtmd_context_deleter>;
+
+struct mtmd_bitmap_deleter {
+    void operator()(mtmd_bitmap * val) { mtmd_bitmap_free(val); }
+};
+using bitmap_ptr = std::unique_ptr<mtmd_bitmap, mtmd_bitmap_deleter>;
+
+struct mtmd_input_chunks_deleter {
+    void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val); }
+};
+using input_chunks_ptr = std::unique_ptr<mtmd_input_chunks, mtmd_input_chunks_deleter>;
+
+struct mtmd_input_chunk_deleter {
+    void operator()(mtmd_input_chunk * val) { mtmd_input_chunk_free(val); }
+};
+using input_chunk_ptr = std::unique_ptr<mtmd_input_chunk, mtmd_input_chunk_deleter>;
+
+struct bitmap {
+    bitmap_ptr ptr;
+    bitmap() : ptr(nullptr) {}
+    bitmap(mtmd_bitmap * bitmap) : ptr(bitmap) {}
+    bitmap(bitmap && other) noexcept : ptr(std::move(other.ptr)) {}
+    bitmap(uint32_t nx, uint32_t ny, const unsigned char * data) {
+        ptr.reset(mtmd_bitmap_init(nx, ny, data));
+    }
+    ~bitmap() = default;
+    uint32_t nx() const { return mtmd_bitmap_get_nx(ptr.get()); }
+    uint32_t ny() const { return mtmd_bitmap_get_ny(ptr.get()); }
+    const unsigned char * data() const { return mtmd_bitmap_get_data(ptr.get()); }
+    size_t n_bytes() const { return mtmd_bitmap_get_n_bytes(ptr.get()); }
+    std::string id() const { return mtmd_bitmap_get_id(ptr.get()); }
+    void set_id(const char * id) const { mtmd_bitmap_set_id(ptr.get(), id); }
+};
+
+struct bitmaps {
+    std::vector<bitmap> entries;
+    ~bitmaps() = default;
+    // return list of pointers to mtmd_bitmap
+    // example:
+    //   auto bitmaps_c_ptr = bitmaps.c_ptr();
+    //   int32_t res = mtmd_tokenize(... bitmaps_c_ptr.data(), bitmaps_c_ptr.size());
+    std::vector<const mtmd_bitmap *> c_ptr() {
+        std::vector<const mtmd_bitmap *> res(entries.size());
+        for (size_t i = 0; i < entries.size(); i++) {
+            res[i] = entries[i].ptr.get();
+        }
+        return res;
+    }
+};
+
+struct input_chunks {
+    input_chunks_ptr ptr;
+    input_chunks() = default;
+    input_chunks(mtmd_input_chunks * chunks) : ptr(chunks) {}
+    ~input_chunks() = default;
+    size_t size() const { return mtmd_input_chunks_size(ptr.get()); }
+    const mtmd_input_chunk * operator[](size_t idx) const {
+        return mtmd_input_chunks_get(ptr.get(), idx);
+    }
+};
+
+} // namespace mtmd
+
+#endif
+
+#endif
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml-cuda.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml-cuda.a
index fffbd13..ae671c5 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml-cuda.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml-cuda.a differ
diff --git a/ggml/llamacpp/wrapper.cpp b/ggml/llamacpp/wrapper.cpp
index 074bb15..e28e4e3 100644
--- a/ggml/llamacpp/wrapper.cpp
+++ b/ggml/llamacpp/wrapper.cpp
@@ -10,6 +10,8 @@
 #include "llama.h"
 #include "common.h"
 #include "sampling.h"
+#include "mtmd.h"
+#include "mtmd-helper.h"
 
 #include <string>
 #include <vector>
@@ -272,4 +274,224 @@ const char* go_llama_last_error(void) {
     return g_last_error.c_str();
 }
 
+// Cached multimodal context — lazily initialized, reused across calls.
+// The cache is keyed on the mmproj path and on the text model it was built
+// for, so a projector initialized against one model is never reused with
+// another. These globals are not synchronized; callers must serialize access.
+static mtmd_context*      g_mtmd_ctx   = nullptr;
+static const llama_model* g_mtmd_model = nullptr;
+static std::string        g_mtmd_path;
+
+// Multimodal generation: tokenizes prompt + images through mtmd, evaluates the
+// chunks into the cleared KV cache (sequence 0), then samples up to
+// params.max_tokens tokens (512 when max_tokens <= 0), handing each piece to
+// callback. Stops at an end-of-generation token, when callback returns false,
+// or when the context window is full.
+// Returns 0 on success, -1 on error (message recorded via set_error).
+int go_llama_generate_with_images(
+    void* ctx_ptr,
+    void* model_ptr,
+    const char* mmproj_path,
+    const char* prompt,
+    go_llama_image* images,
+    int n_images,
+    go_llama_generate_params params,
+    go_llama_token_callback callback,
+    void* user_data)
+{
+    auto* ctx   = static_cast<llama_context*>(ctx_ptr);
+    auto* model = static_cast<llama_model*>(model_ptr);
+
+    if (!ctx || !model || !prompt || !mmproj_path) {
+        set_error("null context, model, prompt, or mmproj_path");
+        return -1;
+    }
+
+    // Validate the image list up front: a negative count would turn into a huge
+    // std::vector size below, and null or empty entries cannot be decoded.
+    if (n_images < 0 || (n_images > 0 && !images)) {
+        set_error("invalid image list");
+        return -1;
+    }
+    for (int i = 0; i < n_images; i++) {
+        if (!images[i].data || images[i].size <= 0) {
+            set_error("empty image data");
+            return -1;
+        }
+    }
+
+    // (Re-)initialize mtmd context if the projector file or the model changed.
+    std::string path_str(mmproj_path);
+    if (!g_mtmd_ctx || g_mtmd_model != model || g_mtmd_path != path_str) {
+        if (g_mtmd_ctx) {
+            mtmd_free(g_mtmd_ctx);
+            g_mtmd_ctx   = nullptr;
+            g_mtmd_model = nullptr;
+        }
+        mtmd_context_params mparams = mtmd_context_params_default();
+        mparams.use_gpu  = true;
+        mparams.warmup   = true;
+        g_mtmd_ctx = mtmd_init_from_file(mmproj_path, model, mparams);
+        if (!g_mtmd_ctx) {
+            set_error("failed to initialize mtmd context from mmproj");
+            return -1;
+        }
+        g_mtmd_model = model;
+        g_mtmd_path  = path_str;
+    }
+
+    // Clear KV cache.
+    llama_memory_clear(llama_get_memory(ctx), false);
+
+    const llama_vocab* vocab = llama_model_get_vocab(model);
+    if (!vocab) {
+        set_error("failed to get vocab from model");
+        return -1;
+    }
+
+    // Build bitmaps from raw image bytes using the helper.
+    std::vector<mtmd_bitmap*> bitmaps(n_images);
+    for (int i = 0; i < n_images; i++) {
+        bitmaps[i] = mtmd_helper_bitmap_init_from_buf(
+            g_mtmd_ctx, images[i].data, (size_t)images[i].size);
+        if (!bitmaps[i]) {
+            // Free already-created bitmaps.
+            for (int j = 0; j < i; j++) {
+                mtmd_bitmap_free(bitmaps[j]);
+            }
+            set_error("failed to decode image data");
+            return -1;
+        }
+    }
+
+    // Build const pointer array for mtmd_tokenize.
+    std::vector<const mtmd_bitmap*> bitmap_ptrs(bitmaps.begin(), bitmaps.end());
+
+    // Tokenize prompt + images.
+    mtmd_input_text input_text;
+    input_text.text          = prompt;
+    input_text.add_special   = true;
+    input_text.parse_special = true;
+
+    mtmd_input_chunks* chunks = mtmd_input_chunks_init();
+    int32_t tok_rc = mtmd_tokenize(g_mtmd_ctx, chunks, &input_text,
+                                   bitmap_ptrs.data(), (size_t)n_images);
+
+    // Free bitmaps — tokenize has consumed them.
+    for (int i = 0; i < n_images; i++) {
+        mtmd_bitmap_free(bitmaps[i]);
+    }
+
+    if (tok_rc != 0) {
+        mtmd_input_chunks_free(chunks);
+        set_error("mtmd_tokenize failed (marker/image count mismatch or preprocessing error)");
+        return -1;
+    }
+
+    // Evaluate all chunks (text + image) into KV cache using the helper.
+    llama_pos n_past = 0;
+    int32_t eval_rc = mtmd_helper_eval_chunks(g_mtmd_ctx, ctx, chunks,
+                                              /*n_past=*/0, /*seq_id=*/0,
+                                              (int32_t)llama_n_batch(ctx),
+                                              /*logits_last=*/true,
+                                              &n_past);
+    mtmd_input_chunks_free(chunks);
+    if (eval_rc != 0) {
+        set_error("mtmd_helper_eval_chunks failed");
+        return -1;
+    }
+
+    // Build sampling parameters (same as go_llama_generate).
+    common_params_sampling sparams;
+    sparams.seed            = (uint32_t)params.seed;
+    sparams.temp            = params.temperature;
+    sparams.top_k           = params.top_k;
+    sparams.top_p           = params.top_p;
+    sparams.min_p           = params.min_p;
+    sparams.penalty_repeat  = params.repeat_penalty;
+    sparams.penalty_freq    = params.freq_penalty;
+    sparams.penalty_present = params.presence_penalty;
+    sparams.penalty_last_n  = params.penalty_last_n;
+
+    common_sampler* smpl = common_sampler_init(model, sparams);
+    if (!smpl) {
+        set_error("failed to initialize sampler");
+        return -1;
+    }
+
+    // Generation loop.
+    std::vector<char> piece_buf(128);
+    int n_cur = (int)n_past;
+    const int n_ctx = (int)llama_n_ctx(ctx);
+    const int max_tokens = params.max_tokens > 0 ? params.max_tokens : 512;
+
+    // One single-token batch is allocated once and reused for every decode step.
+    llama_batch next_batch = llama_batch_init(1, 0, 1);
+    next_batch.n_tokens     = 1;
+    next_batch.n_seq_id[0]  = 1;
+    next_batch.seq_id[0][0] = 0;
+    next_batch.logits[0]    = 1;
+
+    int decode_rc = 0;
+    for (int i = 0; i < max_tokens; i++) {
+        llama_token new_token = common_sampler_sample(smpl, ctx, -1);
+        common_sampler_accept(smpl, new_token, /*accept_grammar=*/true);
+
+        if (llama_vocab_is_eog(vocab, new_token)) {
+            break;
+        }
+
+        int n_piece = llama_token_to_piece(vocab, new_token,
+                                           piece_buf.data(), (int32_t)piece_buf.size() - 1,
+                                           /*lstrip=*/0, /*special=*/false);
+        if (n_piece < 0) {
+            // Buffer too small: -n_piece is the required size, so grow and retry.
+            piece_buf.resize((size_t)(-n_piece) + 1);
+            n_piece = llama_token_to_piece(vocab, new_token,
+                                           piece_buf.data(), (int32_t)piece_buf.size() - 1,
+                                           /*lstrip=*/0, /*special=*/false);
+        }
+
+        // A token that cannot be rendered is not reported, but it must still be
+        // decoded below; skipping the decode would leave stale logits behind.
+        if (n_piece >= 0) {
+            piece_buf[n_piece] = '\0';
+            if (callback && !callback(piece_buf.data(), n_piece, user_data)) {
+                break;
+            }
+        }
+
+        // Context window is full: there is no position left to decode into.
+        if (n_cur >= n_ctx) {
+            break;
+        }
+
+        next_batch.token[0] = new_token;
+        next_batch.pos[0]   = n_cur++;
+
+        decode_rc = llama_decode(ctx, next_batch);
+        if (decode_rc != 0) {
+            break;
+        }
+    }
+
+    llama_batch_free(next_batch);
+    common_sampler_free(smpl);
+    if (decode_rc != 0) {
+        set_error("llama_decode failed during generation");
+        return -1;
+    }
+    return 0;
+}
+
+// Frees the cached multimodal context, if any, and resets the cache keys.
+void go_llama_mtmd_free(void) {
+    if (g_mtmd_ctx) {
+        mtmd_free(g_mtmd_ctx);
+        g_mtmd_ctx   = nullptr;
+        g_mtmd_model = nullptr;
+        g_mtmd_path.clear();
+    }
+}
+
 } // extern "C"
diff --git a/ggml/llamacpp/wrapper.h b/ggml/llamacpp/wrapper.h
index 3b4a3db..f82309b 100644
--- a/ggml/llamacpp/wrapper.h
+++ b/ggml/llamacpp/wrapper.h
@@ -55,6 +55,29 @@ int go_llama_embeddings(
 // Last error message (thread-local).
 const char* go_llama_last_error(void);
 
+// Image data for multimodal generation: one encoded image file held in memory.
+typedef struct {
+    const unsigned char* data; // encoded image bytes, decoded via mtmd_helper_bitmap_init_from_buf
+    int size; // length of data in bytes
+} go_llama_image;
+
+// Generation with images: tokenize prompt + images via mtmd, evaluate, then generate.
+// mmproj_path: path to the multimodal projector GGUF file; images holds n_images entries.
+// Returns 0 on success, -1 on error (message readable via go_llama_last_error()).
+int go_llama_generate_with_images(
+    void* ctx_ptr,
+    void* model_ptr,
+    const char* mmproj_path,
+    const char* prompt,
+    go_llama_image* images,
+    int n_images,
+    go_llama_generate_params params,
+    go_llama_token_callback callback,
+    void* user_data);
+
+// Free cached multimodal context (call on shutdown); does nothing if none is cached.
+void go_llama_mtmd_free(void);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml-cuda.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml-cuda.a
index 2cc3042..c4e6da5 100644
Binary files a/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml-cuda.a and b/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml-cuda.a differ