Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 75 additions & 1 deletion ggml/llamacpp/llamacpp.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ package llamacpp
#cgo CXXFLAGS: -std=c++17 -I${SRCDIR}/third_party/include -I${SRCDIR}/third_party/ggml/include -I${SRCDIR}/third_party/common
#cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/darwin-arm64
#cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/darwin-amd64
#cgo darwin LDFLAGS: -lcommon -lllama -lggml-cpu -lggml-base -lggml -lggml-blas -lggml-metal -L/usr/local/opt/libomp/lib -L/opt/homebrew/opt/libomp/lib -lomp -framework Accelerate -framework Metal -framework Foundation -lstdc++ -lm
#cgo darwin LDFLAGS: -lmtmd -lcommon -lllama -lggml-cpu -lggml-base -lggml -lggml-blas -lggml-metal -L/usr/local/opt/libomp/lib -L/opt/homebrew/opt/libomp/lib -lomp -framework Accelerate -framework Metal -framework Foundation -lstdc++ -lm
#include <stdlib.h>
#include <stdbool.h>
#include "wrapper.h"
Expand Down Expand Up @@ -317,6 +317,80 @@ func (c *Context) GetEmbeddings(text string) ([]float32, error) {
return out[:n], nil
}

// GenerateStreamWithImages runs multimodal generation with images, streaming tokens via callback.
// mmprojPath is the path to the multimodal projector GGUF file.
// images is a slice of raw image bytes (JPEG/PNG); each entry must be non-empty.
// Return false from cb to stop generation.
func (c *Context) GenerateStreamWithImages(prompt string, images [][]byte, mmprojPath string, cb func(token string) bool, opts ...GenerateOption) error {
	c.mu.Lock()
	defer c.mu.Unlock()

	if c.c == nil || c.model.c == nil {
		return errors.New("context or model is closed")
	}

	cfg := defaultGenerateConfig()
	for _, o := range opts {
		o(&cfg)
	}

	cprompt := C.CString(prompt)
	defer C.free(unsafe.Pointer(cprompt))

	cmmproj := C.CString(mmprojPath)
	defer C.free(unsafe.Pointer(cmmproj))

	params := C.go_llama_default_generate_params()
	params.max_tokens = C.int(cfg.maxTokens)
	params.temperature = C.float(cfg.temperature)
	params.top_k = C.int(cfg.topK)
	params.top_p = C.float(cfg.topP)
	params.min_p = C.float(cfg.minP)
	params.repeat_penalty = C.float(cfg.repeatPenalty)
	params.freq_penalty = C.float(cfg.freqPenalty)
	params.presence_penalty = C.float(cfg.presencePenalty)
	params.seed = C.int(cfg.seed)
	params.penalty_last_n = C.int(cfg.penaltyLastN)

	// Build a C array of go_llama_image entirely in C memory.
	//
	// The structs AND the image bytes must live on the C heap: cgo forbids
	// passing a Go pointer to memory that itself contains Go pointers, so a
	// Go-allocated []C.go_llama_image whose .data fields point into Go slices
	// would panic at the call below ("cgo argument has Go pointer to Go
	// pointer") under the default cgocheck. C.CBytes copies each image, and
	// the deferred frees run after go_llama_generate_with_images returns.
	var cImages *C.go_llama_image
	if len(images) > 0 {
		arrBytes := C.size_t(len(images)) * C.size_t(unsafe.Sizeof(C.go_llama_image{}))
		cImages = (*C.go_llama_image)(C.malloc(arrBytes))
		if cImages == nil {
			return errors.New("allocating image array failed")
		}
		defer C.free(unsafe.Pointer(cImages))
		arr := unsafe.Slice(cImages, len(images))
		for i, img := range images {
			if len(img) == 0 {
				// &img[0] / C.CBytes on an empty slice would be invalid.
				return fmt.Errorf("image %d is empty", i)
			}
			data := C.CBytes(img) // copy into C memory
			defer C.free(data)
			arr[i].data = (*C.uchar)(data)
			arr[i].size = C.int(len(img))
		}
	}

	state := &streamState{cb: cb}
	handle := registerCallback(state)
	defer unregisterCallback(handle)

	rc := C.go_llama_generate_with_images(
		unsafe.Pointer(c.c),
		unsafe.Pointer(c.model.c),
		cmmproj,
		cprompt,
		cImages,
		C.int(len(images)),
		params,
		C.go_llama_token_callback(C.goTokenCallbackBridge),
		unsafe.Pointer(handle),
	)

	if rc != 0 {
		errMsg := C.GoString(C.go_llama_last_error())
		return fmt.Errorf("generate with images failed: %s", errMsg)
	}
	return nil
}

// FreeMTMD frees the cached multimodal (mtmd) context held on the C side.
// Call once on shutdown after all multimodal generation has finished.
// NOTE(review): presumably safe to call when no multimodal context was ever
// created (the C side treating a missing context as a no-op) — confirm in
// the go_llama_mtmd_free implementation in wrapper.h/.cpp.
func FreeMTMD() {
	C.go_llama_mtmd_free()
}

// Close frees the context resources.
func (c *Context) Close() {
c.mu.Lock()
Expand Down
2 changes: 1 addition & 1 deletion ggml/llamacpp/llamacpp_android.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@ package llamacpp

/*
#cgo android,arm64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/android-arm64
#cgo android LDFLAGS: -Wl,--start-group -lcommon -lllama -lggml-cpu -lggml-base -lggml -Wl,--end-group -lstdc++ -lm -ldl -llog
#cgo android LDFLAGS: -Wl,--start-group -lmtmd -lcommon -lllama -lggml-cpu -lggml-base -lggml -Wl,--end-group -lstdc++ -lm -ldl -llog
*/
import "C"
2 changes: 1 addition & 1 deletion ggml/llamacpp/llamacpp_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,6 @@ package llamacpp
/*
#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/linux-amd64
#cgo linux,arm64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/linux-arm64
#cgo linux LDFLAGS: -Wl,--start-group -lcommon -lllama -lggml-cpu -lggml-base -lggml -Wl,--end-group -lstdc++ -lm -lpthread -ldl -lrt -lgomp
#cgo linux LDFLAGS: -Wl,--start-group -lmtmd -lcommon -lllama -lggml-cpu -lggml-base -lggml -Wl,--end-group -lstdc++ -lm -lpthread -ldl -lrt -lgomp
*/
import "C"
2 changes: 1 addition & 1 deletion ggml/llamacpp/llamacpp_linux_cuda.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@ package llamacpp

/*
#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/linux-amd64-cuda
#cgo linux LDFLAGS: -Wl,--start-group -lcommon -lllama -lggml-cpu -lggml-base -lggml -lggml-cuda -Wl,--end-group -lcuda -lcudart -lcublas -lcublasLt -lstdc++ -lm -lpthread -ldl -lrt -lgomp
#cgo linux LDFLAGS: -Wl,--start-group -lmtmd -lcommon -lllama -lggml-cpu -lggml-base -lggml -lggml-cuda -Wl,--end-group -lcuda -lcudart -lcublas -lcublasLt -lstdc++ -lm -lpthread -ldl -lrt -lgomp
*/
import "C"
2 changes: 1 addition & 1 deletion ggml/llamacpp/llamacpp_linux_vulkan.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@ package llamacpp

/*
#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/linux-amd64-vulkan
#cgo linux LDFLAGS: -Wl,--start-group -lcommon -lllama -lggml-cpu -lggml-base -lggml -lggml-vulkan -Wl,--end-group -lvulkan -lstdc++ -lm -lpthread -ldl -lrt -lgomp
#cgo linux LDFLAGS: -Wl,--start-group -lmtmd -lcommon -lllama -lggml-cpu -lggml-base -lggml -lggml-vulkan -Wl,--end-group -lvulkan -lstdc++ -lm -lpthread -ldl -lrt -lgomp
*/
import "C"
96 changes: 96 additions & 0 deletions ggml/llamacpp/third_party/include/mtmd-helper.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#ifndef MTMD_HELPER_H
#define MTMD_HELPER_H

#include "ggml.h"
#include "llama.h"
#include "mtmd.h"

#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>

#ifdef __cplusplus
extern "C" {
#endif

//
// libmtmd helper functions
//
// Please note that these helpers are not guaranteed to be stable.
// BREAKING CHANGES are expected.
//

// Set callback for all future logging events.
// If this is not called, or NULL is supplied, everything is output on stderr.
// Note: this also calls mtmd_log_set() internally.
MTMD_API void mtmd_helper_log_set(ggml_log_callback log_callback, void * user_data);

// helper function to construct a mtmd_bitmap from a file
// it calls mtmd_helper_bitmap_init_from_buf() internally
// returns nullptr on failure
// this function is thread-safe
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname);

// helper function to construct a mtmd_bitmap from a buffer containing a file
// supported formats:
//     image: formats supported by stb_image: jpg, png, bmp, gif, etc.
//     audio: formats supported by miniaudio: wav, mp3, flac
// note: audio files will be auto-detected based on magic bytes
// returns nullptr on failure
// this function is thread-safe
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len);

// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);

// helper to count the total number of positions from a list of chunks, useful to keep track of n_past
// normally, n_pos is equal to n_tokens, but for M-RoPE it is different
MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);

// helper function that automatically:
// 1. runs llama_decode() on text chunks
// 2. runs mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
// if any of the mtmd_encode() or llama_decode() calls return non-zero, stops and forwards the error
// otherwise, returns 0 on success
// this function is NOT thread-safe
MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
                                         struct llama_context * lctx,
                                         const mtmd_input_chunks * chunks,
                                         llama_pos n_past,
                                         llama_seq_id seq_id,
                                         int32_t n_batch,
                                         bool logits_last,
                                         llama_pos * new_n_past);

// works like mtmd_helper_eval_chunks(), but only for a single chunk
// this function is NOT thread-safe
MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
                                               struct llama_context * lctx,
                                               const mtmd_input_chunk * chunk,
                                               llama_pos n_past,
                                               llama_seq_id seq_id,
                                               int32_t n_batch,
                                               bool logits_last,
                                               llama_pos * new_n_past);

// helper function to decode an image whose embeddings have already been calculated
// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
// returns 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
                                                struct llama_context * lctx,
                                                const mtmd_input_chunk * chunk,
                                                float * encoded_embd,
                                                llama_pos n_past,
                                                llama_seq_id seq_id,
                                                int32_t n_batch,
                                                llama_pos * new_n_past);

#ifdef __cplusplus
} // extern "C"
#endif

//
// C++ wrappers
//

#endif
Loading
Loading