diff --git a/ggml/llamacpp/llamacpp.go b/ggml/llamacpp/llamacpp.go index 96ce0ab..ba0eb50 100644 --- a/ggml/llamacpp/llamacpp.go +++ b/ggml/llamacpp/llamacpp.go @@ -12,7 +12,7 @@ package llamacpp #cgo CXXFLAGS: -std=c++17 -I${SRCDIR}/third_party/include -I${SRCDIR}/third_party/ggml/include -I${SRCDIR}/third_party/common #cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/darwin-arm64 #cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/darwin-amd64 -#cgo darwin LDFLAGS: -lcommon -lllama -lggml-cpu -lggml-base -lggml -lggml-blas -lggml-metal -L/usr/local/opt/libomp/lib -L/opt/homebrew/opt/libomp/lib -lomp -framework Accelerate -framework Metal -framework Foundation -lstdc++ -lm +#cgo darwin LDFLAGS: -lmtmd -lcommon -lllama -lggml-cpu -lggml-base -lggml -lggml-blas -lggml-metal -L/usr/local/opt/libomp/lib -L/opt/homebrew/opt/libomp/lib -lomp -framework Accelerate -framework Metal -framework Foundation -lstdc++ -lm #include #include #include "wrapper.h" @@ -317,6 +317,109 @@ func (c *Context) GetEmbeddings(text string) ([]float32, error) { return out[:n], nil }
+// GenerateStreamWithImages runs multimodal generation over prompt and images,
+// streaming each generated token piece to cb. Return false from cb to stop
+// generation early.
+//
+// mmprojPath is the path to the multimodal projector GGUF file. images holds
+// the raw encoded bytes of each image (JPEG/PNG); every entry must be
+// non-empty. The bytes are copied into C memory for the duration of the call,
+// so the caller's slices are not retained.
+//
+// The projector context is cached in one process-wide C variable that c.mu
+// does not guard, so calls on different Contexts must not run concurrently
+// with each other or with FreeMTMD.
+func (c *Context) GenerateStreamWithImages(prompt string, images [][]byte, mmprojPath string, cb func(token string) bool, opts ...GenerateOption) error {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	if c.c == nil || c.model.c == nil {
+		return errors.New("context or model is closed")
+	}
+
+	// Validate up front: an empty image would panic on &img[0], and the C
+	// struct carries the length as a 32-bit int.
+	const maxImageSize = 1<<31 - 1
+	for i, img := range images {
+		if len(img) == 0 {
+			return fmt.Errorf("image %d is empty", i)
+		}
+		if len(img) > maxImageSize {
+			return fmt.Errorf("image %d is too large: %d bytes", i, len(img))
+		}
+	}
+
+	cfg := defaultGenerateConfig()
+	for _, o := range opts {
+		o(&cfg)
+	}
+
+	cprompt := C.CString(prompt)
+	defer C.free(unsafe.Pointer(cprompt))
+
+	cmmproj := C.CString(mmprojPath)
+	defer C.free(unsafe.Pointer(cmmproj))
+
+	params := C.go_llama_default_generate_params()
+	params.max_tokens = C.int(cfg.maxTokens)
+	params.temperature = C.float(cfg.temperature)
+	params.top_k = C.int(cfg.topK)
+	params.top_p = C.float(cfg.topP)
+	params.min_p = C.float(cfg.minP)
+	params.repeat_penalty = C.float(cfg.repeatPenalty)
+	params.freq_penalty = C.float(cfg.freqPenalty)
+	params.presence_penalty = C.float(cfg.presencePenalty)
+	params.seed = C.int(cfg.seed)
+	params.penalty_last_n = C.int(cfg.penaltyLastN)
+
+	// Build the go_llama_image array entirely in C memory. A Go-allocated array
+	// whose elements point at Go byte slices is Go memory containing Go
+	// pointers, which the cgo pointer-passing rules forbid handing to C.
+	var cImages *C.go_llama_image
+	if n := len(images); n > 0 {
+		arr := C.malloc(C.size_t(n) * C.size_t(unsafe.Sizeof(C.go_llama_image{})))
+		defer C.free(arr)
+
+		cImages = (*C.go_llama_image)(arr)
+		entries := unsafe.Slice(cImages, n)
+		for i, img := range images {
+			data := C.CBytes(img)
+			// Deferred on purpose: the copies must outlive the C call below.
+			defer C.free(data)
+			entries[i].data = (*C.uchar)(data)
+			entries[i].size = C.int(len(img))
+		}
+	}
+
+	state := &streamState{cb: cb}
+	handle := registerCallback(state)
+	defer unregisterCallback(handle)
+
+	rc := C.go_llama_generate_with_images(
+		unsafe.Pointer(c.c),
+		unsafe.Pointer(c.model.c),
+		cmmproj,
+		cprompt,
+		cImages,
+		C.int(len(images)),
+		params,
+		C.go_llama_token_callback(C.goTokenCallbackBridge),
+		unsafe.Pointer(handle),
+	)
+
+	if rc != 0 {
+		errMsg := C.GoString(C.go_llama_last_error())
+		return fmt.Errorf("generate with images failed: %s", errMsg)
+	}
+	return nil
+}
+
+// FreeMTMD frees the cached multimodal context. Call on shutdown; it must not
+// run concurrently with GenerateStreamWithImages.
+func FreeMTMD() {
+	C.go_llama_mtmd_free()
+}
+
 // Close frees the context resources.
func (c *Context) Close() { c.mu.Lock() diff --git a/ggml/llamacpp/llamacpp_android.go b/ggml/llamacpp/llamacpp_android.go index 1e2265f..f1b95b2 100644 --- a/ggml/llamacpp/llamacpp_android.go +++ b/ggml/llamacpp/llamacpp_android.go @@ -7,6 +7,6 @@ package llamacpp /* #cgo android,arm64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/android-arm64 -#cgo android LDFLAGS: -Wl,--start-group -lcommon -lllama -lggml-cpu -lggml-base -lggml -Wl,--end-group -lstdc++ -lm -ldl -llog +#cgo android LDFLAGS: -Wl,--start-group -lmtmd -lcommon -lllama -lggml-cpu -lggml-base -lggml -Wl,--end-group -lstdc++ -lm -ldl -llog */ import "C" diff --git a/ggml/llamacpp/llamacpp_linux.go b/ggml/llamacpp/llamacpp_linux.go index 0d79d74..a35551a 100644 --- a/ggml/llamacpp/llamacpp_linux.go +++ b/ggml/llamacpp/llamacpp_linux.go @@ -8,6 +8,6 @@ package llamacpp /* #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/linux-amd64 #cgo linux,arm64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/linux-arm64 -#cgo linux LDFLAGS: -Wl,--start-group -lcommon -lllama -lggml-cpu -lggml-base -lggml -Wl,--end-group -lstdc++ -lm -lpthread -ldl -lrt -lgomp +#cgo linux LDFLAGS: -Wl,--start-group -lmtmd -lcommon -lllama -lggml-cpu -lggml-base -lggml -Wl,--end-group -lstdc++ -lm -lpthread -ldl -lrt -lgomp */ import "C" diff --git a/ggml/llamacpp/llamacpp_linux_cuda.go b/ggml/llamacpp/llamacpp_linux_cuda.go index 2e8578d..d7c8fd5 100644 --- a/ggml/llamacpp/llamacpp_linux_cuda.go +++ b/ggml/llamacpp/llamacpp_linux_cuda.go @@ -7,6 +7,6 @@ package llamacpp /* #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/linux-amd64-cuda -#cgo linux LDFLAGS: -Wl,--start-group -lcommon -lllama -lggml-cpu -lggml-base -lggml -lggml-cuda -Wl,--end-group -lcuda -lcudart -lcublas -lcublasLt -lstdc++ -lm -lpthread -ldl -lrt -lgomp +#cgo linux LDFLAGS: -Wl,--start-group -lmtmd -lcommon -lllama -lggml-cpu -lggml-base -lggml -lggml-cuda -Wl,--end-group -lcuda -lcudart -lcublas -lcublasLt -lstdc++ -lm -lpthread -ldl -lrt -lgomp */ 
import "C" diff --git a/ggml/llamacpp/llamacpp_linux_vulkan.go b/ggml/llamacpp/llamacpp_linux_vulkan.go index b15c767..97fea4b 100644 --- a/ggml/llamacpp/llamacpp_linux_vulkan.go +++ b/ggml/llamacpp/llamacpp_linux_vulkan.go @@ -7,6 +7,6 @@ package llamacpp /* #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/linux-amd64-vulkan -#cgo linux LDFLAGS: -Wl,--start-group -lcommon -lllama -lggml-cpu -lggml-base -lggml -lggml-vulkan -Wl,--end-group -lvulkan -lstdc++ -lm -lpthread -ldl -lrt -lgomp +#cgo linux LDFLAGS: -Wl,--start-group -lmtmd -lcommon -lllama -lggml-cpu -lggml-base -lggml -lggml-vulkan -Wl,--end-group -lvulkan -lstdc++ -lm -lpthread -ldl -lrt -lgomp */ import "C" diff --git a/ggml/llamacpp/third_party/include/mtmd-helper.h b/ggml/llamacpp/third_party/include/mtmd-helper.h new file mode 100644 index 0000000..5036b92 --- /dev/null +++ b/ggml/llamacpp/third_party/include/mtmd-helper.h @@ -0,0 +1,96 @@ +#ifndef MTMD_HELPER_H +#define MTMD_HELPER_H + +#include "ggml.h" +#include "llama.h" +#include "mtmd.h" + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// +// libmtmd helper functions +// +// Please note that these helpers are not guaranteed to be stable. +// BREAKING CHANGES are expected. +// + +// Set callback for all future logging events. +// If this is not called, or NULL is supplied, everything is output on stderr. +// Note: this also call mtmd_log_set() internally +MTMD_API void mtmd_helper_log_set(ggml_log_callback log_callback, void * user_data); + +// helper function to construct a mtmd_bitmap from a file +// it calls mtmd_helper_bitmap_init_from_buf() internally +// returns nullptr on failure +// this function is thread-safe +MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname); + +// helper function to construct a mtmd_bitmap from a buffer containing a file +// supported formats: +// image: formats supported by stb_image: jpg, png, bmp, gif, etc. 
+// audio: formats supported by miniaudio: wav, mp3, flac +// note: audio files will be auto-detected based on magic bytes +// returns nullptr on failure +// this function is thread-safe +MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len); + +// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache +MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks); + +// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past +// normally, n_pos is equal to n_tokens, but for M-RoPE it is different +MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks); + +// helper function that automatically: +// 1. run llama_decode() on text chunks +// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode() +// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error +// otherwise, returns 0 on success +// this function is NOT thread-safe +MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx, + struct llama_context * lctx, + const mtmd_input_chunks * chunks, + llama_pos n_past, + llama_seq_id seq_id, + int32_t n_batch, + bool logits_last, + llama_pos * new_n_past); + +// works like mtmd_helper_eval_chunks(), but only for a single chunk +// this function is NOT thread-safe +MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, + struct llama_context * lctx, + const mtmd_input_chunk * chunk, + llama_pos n_past, + llama_seq_id seq_id, + int32_t n_batch, + bool logits_last, + llama_pos * new_n_past); + +// helper function to decode an image whose embeddings have already been calculated +// this helper will handle batching and pre/post decoding setup (for ex. 
gemma 3 requires non-causal attention) +// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure +MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx, + struct llama_context * lctx, + const mtmd_input_chunk * chunk, + float * encoded_embd, + llama_pos n_past, + llama_seq_id seq_id, + int32_t n_batch, + llama_pos * new_n_past); + +#ifdef __cplusplus +} // extern "C" +#endif + +// +// C++ wrappers +// + +#endif diff --git a/ggml/llamacpp/third_party/include/mtmd.h b/ggml/llamacpp/third_party/include/mtmd.h new file mode 100644 index 0000000..ebb4a18 --- /dev/null +++ b/ggml/llamacpp/third_party/include/mtmd.h @@ -0,0 +1,319 @@ +#ifndef MTMD_H +#define MTMD_H + +#include "ggml.h" +#include "llama.h" + +#include +#include +#include + +#ifdef __cplusplus +#include +#include +#include +#include +#endif + +/** + * libmtmd: A library for multimodal support in llama.cpp. + * + * WARNING: This API is experimental and subject to many BREAKING CHANGES. + * Issues related to API usage may receive lower priority support. + * + * For the usage, see an example in mtmd-cli.cpp + * + * For contributors: + * - Make sure the C API is aligned with the libllama C API (as in llama.h) + * - Do not include model name (e.g., qwen, gemma) in the API, use generic terms instead + * - Keep the API minimal, do not expose internal details unless necessary + * + * IMPORTANT: The mtmd module does NOT accept pull requests that are fully or predominantly AI-generated. + * We encourage human contributors to ensure the quality and reliability of the codebase. 
+ */ + +#ifdef LLAMA_SHARED +# if defined(_WIN32) && !defined(__MINGW32__) +# ifdef LLAMA_BUILD +# define MTMD_API __declspec(dllexport) +# else +# define MTMD_API __declspec(dllimport) +# endif +# else +# define MTMD_API __attribute__ ((visibility ("default"))) +# endif +#else +# define MTMD_API +#endif + +// deprecated marker, use mtmd_default_marker() instead +#define MTMD_DEFAULT_IMAGE_MARKER "<__image__>" + +#ifdef __cplusplus +extern "C" { +#endif + +enum mtmd_input_chunk_type { + MTMD_INPUT_CHUNK_TYPE_TEXT, + MTMD_INPUT_CHUNK_TYPE_IMAGE, + MTMD_INPUT_CHUNK_TYPE_AUDIO, +}; + +// opaque types +struct mtmd_context; +struct mtmd_bitmap; +struct mtmd_image_tokens; +struct mtmd_input_chunk; +struct mtmd_input_chunks; + +struct mtmd_input_text { + const char * text; + bool add_special; + bool parse_special; +}; + +// +// C API +// + +typedef struct mtmd_context mtmd_context; +typedef struct mtmd_bitmap mtmd_bitmap; +typedef struct mtmd_image_tokens mtmd_image_tokens; +typedef struct mtmd_input_chunk mtmd_input_chunk; +typedef struct mtmd_input_chunks mtmd_input_chunks; +typedef struct mtmd_input_text mtmd_input_text; + +struct mtmd_context_params { + bool use_gpu; + bool print_timings; + int n_threads; + const char * image_marker; // deprecated, use media_marker instead + const char * media_marker; + enum llama_flash_attn_type flash_attn_type; + bool warmup; // whether to run a warmup encode pass after initialization + + // limit number of image tokens, only for vision models with dynamic resolution + int image_min_tokens; // minimum number of tokens for image input (default: read from metadata) + int image_max_tokens; // maximum number of tokens for image input (default: read from metadata) + + // callback function passed over to mtmd proper + ggml_backend_sched_eval_callback cb_eval; + void * cb_eval_user_data; +}; + +MTMD_API const char * mtmd_default_marker(void); + +MTMD_API struct mtmd_context_params mtmd_context_params_default(void); + +// initialize the 
mtmd context +// return nullptr on failure +MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname, + const struct llama_model * text_model, + const struct mtmd_context_params ctx_params); + +MTMD_API void mtmd_free(mtmd_context * ctx); + +// whether we need to set non-causal mask before llama_decode +MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx); + +// whether the current model use M-RoPE for llama_decode +MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx); + +// whether the current model supports vision input +MTMD_API bool mtmd_support_vision(mtmd_context * ctx); + +// whether the current model supports audio input +MTMD_API bool mtmd_support_audio(mtmd_context * ctx); + +// get audio sample rate in Hz, for example 16000 for Whisper +// return -1 if audio is not supported +MTMD_API int mtmd_get_audio_sample_rate(mtmd_context * ctx); + +// mtmd_bitmap +// +// if bitmap is image: +// length of data must be nx * ny * 3 +// the data is in RGBRGBRGB... format +// if bitmap is audio: +// length of data must be n_samples * sizeof(float) +// the data is in float format (PCM F32) +MTMD_API mtmd_bitmap * mtmd_bitmap_init (uint32_t nx, uint32_t ny, const unsigned char * data); +MTMD_API mtmd_bitmap * mtmd_bitmap_init_from_audio(size_t n_samples, const float * data); +MTMD_API uint32_t mtmd_bitmap_get_nx (const mtmd_bitmap * bitmap); +MTMD_API uint32_t mtmd_bitmap_get_ny (const mtmd_bitmap * bitmap); +MTMD_API const unsigned char * mtmd_bitmap_get_data (const mtmd_bitmap * bitmap); +MTMD_API size_t mtmd_bitmap_get_n_bytes(const mtmd_bitmap * bitmap); +MTMD_API bool mtmd_bitmap_is_audio (const mtmd_bitmap * bitmap); +MTMD_API void mtmd_bitmap_free (mtmd_bitmap * bitmap); +// bitmap ID is optional, but useful for KV cache tracking +// these getters/setters are dedicated functions, so you can for example calculate the hash of the image based on mtmd_bitmap_get_data() +MTMD_API const char * mtmd_bitmap_get_id(const mtmd_bitmap * bitmap); 
+MTMD_API void mtmd_bitmap_set_id(mtmd_bitmap * bitmap, const char * id); + + +// mtmd_input_chunks +// +// this is simply a list of mtmd_input_chunk +// the elements can only be populated via mtmd_tokenize() +MTMD_API mtmd_input_chunks * mtmd_input_chunks_init(void); +MTMD_API size_t mtmd_input_chunks_size(const mtmd_input_chunks * chunks); +MTMD_API const mtmd_input_chunk * mtmd_input_chunks_get (const mtmd_input_chunks * chunks, size_t idx); +MTMD_API void mtmd_input_chunks_free(mtmd_input_chunks * chunks); + +// mtmd_input_chunk +// +// the instance will be constructed via mtmd_tokenize() +// it will be freed along with mtmd_input_chunks +MTMD_API enum mtmd_input_chunk_type mtmd_input_chunk_get_type (const mtmd_input_chunk * chunk); +MTMD_API const llama_token * mtmd_input_chunk_get_tokens_text (const mtmd_input_chunk * chunk, size_t * n_tokens_output); +MTMD_API const mtmd_image_tokens * mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk); +MTMD_API size_t mtmd_input_chunk_get_n_tokens (const mtmd_input_chunk * chunk); +// returns nullptr for ID on text chunk +MTMD_API const char * mtmd_input_chunk_get_id (const mtmd_input_chunk * chunk); +// number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise) +MTMD_API llama_pos mtmd_input_chunk_get_n_pos (const mtmd_input_chunk * chunk); + +// in case you want to use custom logic to handle the chunk (i.e. 
KV cache management) +// you can move the chunk ownership to your own code by copying it +// remember to free the chunk when you are done with it +MTMD_API mtmd_input_chunk * mtmd_input_chunk_copy(const mtmd_input_chunk * chunk); +MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk); + + +// mtmd_image_tokens +// +// the instance will be constructed via mtmd_tokenize() +// it will be freed along with mtmd_input_chunk +MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); // TODO: deprecate +MTMD_API size_t mtmd_image_tokens_get_nx (const mtmd_image_tokens * image_tokens); +MTMD_API size_t mtmd_image_tokens_get_ny (const mtmd_image_tokens * image_tokens); +MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens); // TODO: deprecate +// number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise) +MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens); // TODO: deprecate + +// tokenize an input text prompt and a list of bitmaps (images/audio) +// the prompt must have the input image marker (default: "<__media__>") in it +// the default marker is defined by mtmd_default_marker() +// the marker will be replaced with the image/audio chunk +// for example: +// "here is an image: <__media__>\ndescribe it in detail." +// this will gives 3 chunks: +// 1. "here is an image: " +// 2. (image/audio tokens) +// 3. "\ndescribe it in detail." 
+// number of bitmaps must be equal to the number of markers in the prompt +// this function is thread-safe (shared ctx) +// return values: +// 0 on success +// 1 on number of bitmaps not matching the number of markers +// 2 on image preprocessing error +MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx, + mtmd_input_chunks * output, + const mtmd_input_text * text, + const mtmd_bitmap ** bitmaps, + size_t n_bitmaps); + +// returns 0 on success +// TODO: deprecate +MTMD_API int32_t mtmd_encode(mtmd_context * ctx, + const mtmd_image_tokens * image_tokens); + +// returns 0 on success +MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx, + const mtmd_input_chunk * chunk); + +// get output embeddings from the last encode pass +// the reading size (in bytes) is equal to: +// llama_model_n_embd_inp(model) * mtmd_input_chunk_get_n_tokens(chunk) * sizeof(float) +MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx); + +// Set callback for all future logging events. +// If this is not called, or NULL is supplied, everything is output on stderr. 
+MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data); + +///////////////////////////////////////// + +// test function, to be used in test-mtmd-c-api.c +MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void); + +#ifdef __cplusplus +} // extern "C" +#endif + +// +// C++ wrappers +// + +#ifdef __cplusplus + +namespace mtmd { + +struct mtmd_context_deleter { + void operator()(mtmd_context * val) { mtmd_free(val); } +}; +using context_ptr = std::unique_ptr; + +struct mtmd_bitmap_deleter { + void operator()(mtmd_bitmap * val) { mtmd_bitmap_free(val); } +}; +using bitmap_ptr = std::unique_ptr; + +struct mtmd_input_chunks_deleter { + void operator()(mtmd_input_chunks * val) { mtmd_input_chunks_free(val); } +}; +using input_chunks_ptr = std::unique_ptr; + +struct mtmd_input_chunk_deleter { + void operator()(mtmd_input_chunk * val) { mtmd_input_chunk_free(val); } +}; +using input_chunk_ptr = std::unique_ptr; + +struct bitmap { + bitmap_ptr ptr; + bitmap() : ptr(nullptr) {} + bitmap(mtmd_bitmap * bitmap) : ptr(bitmap) {} + bitmap(bitmap && other) noexcept : ptr(std::move(other.ptr)) {} + bitmap(uint32_t nx, uint32_t ny, const unsigned char * data) { + ptr.reset(mtmd_bitmap_init(nx, ny, data)); + } + ~bitmap() = default; + uint32_t nx() const { return mtmd_bitmap_get_nx(ptr.get()); } + uint32_t ny() const { return mtmd_bitmap_get_ny(ptr.get()); } + const unsigned char * data() const { return mtmd_bitmap_get_data(ptr.get()); } + size_t n_bytes() const { return mtmd_bitmap_get_n_bytes(ptr.get()); } + std::string id() const { return mtmd_bitmap_get_id(ptr.get()); } + void set_id(const char * id) const { mtmd_bitmap_set_id(ptr.get(), id); } +}; + +struct bitmaps { + std::vector entries; + ~bitmaps() = default; + // return list of pointers to mtmd_bitmap + // example: + // auto bitmaps_c_ptr = bitmaps.c_ptr(); + // int32_t res = mtmd_tokenize(... 
bitmaps_c_ptr.data(), bitmaps_c_ptr.size()); + std::vector c_ptr() { + std::vector res(entries.size()); + for (size_t i = 0; i < entries.size(); i++) { + res[i] = entries[i].ptr.get(); + } + return res; + } +}; + +struct input_chunks { + input_chunks_ptr ptr; + input_chunks() = default; + input_chunks(mtmd_input_chunks * chunks) : ptr(chunks) {} + ~input_chunks() = default; + size_t size() const { return mtmd_input_chunks_size(ptr.get()); } + const mtmd_input_chunk * operator[](size_t idx) const { + return mtmd_input_chunks_get(ptr.get(), idx); + } +}; + +} // namespace mtmd + +#endif + +#endif diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml-cuda.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml-cuda.a index fffbd13..ae671c5 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml-cuda.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml-cuda.a differ diff --git a/ggml/llamacpp/wrapper.cpp b/ggml/llamacpp/wrapper.cpp index 074bb15..e28e4e3 100644 --- a/ggml/llamacpp/wrapper.cpp +++ b/ggml/llamacpp/wrapper.cpp @@ -10,6 +10,8 @@ #include "llama.h" #include "common.h" #include "sampling.h" +#include "mtmd.h" +#include "mtmd-helper.h" #include #include @@ -272,4 +274,226 @@ const char* go_llama_last_error(void) { return g_last_error.c_str(); }
+// Cached multimodal context — lazily initialized, reused across calls.
+// The cache is keyed on the projector path and on the text model pointer that
+// was handed to mtmd_init_from_file(), so loading a different model rebuilds
+// the context instead of reusing one created for the previous model.
+// Not synchronized: callers must serialize every function that touches it.
+static mtmd_context* g_mtmd_ctx = nullptr;
+static std::string g_mtmd_path;
+static const void* g_mtmd_model = nullptr;
+
+// Releases the cached mtmd context (if any) and clears its cache key.
+static void mtmd_cache_reset(void) {
+    if (g_mtmd_ctx) {
+        mtmd_free(g_mtmd_ctx);
+        g_mtmd_ctx = nullptr;
+    }
+    g_mtmd_path.clear();
+    g_mtmd_model = nullptr;
+}
+
+// Runs multimodal generation: tokenizes prompt + images through mtmd,
+// evaluates the chunks into a freshly cleared KV cache, then samples up to
+// params.max_tokens tokens (512 when max_tokens <= 0), passing each piece to
+// callback. Stops early on an end-of-generation token, when callback returns
+// false, or when the context window is full.
+// Returns 0 on success, -1 on error (see go_llama_last_error()).
+int go_llama_generate_with_images(
+    void* ctx_ptr,
+    void* model_ptr,
+    const char* mmproj_path,
+    const char* prompt,
+    go_llama_image* images,
+    int n_images,
+    go_llama_generate_params params,
+    go_llama_token_callback callback,
+    void* user_data)
+{
+    auto* ctx = static_cast<llama_context*>(ctx_ptr);
+    auto* model = static_cast<llama_model*>(model_ptr);
+
+    if (!ctx || !model || !prompt || !mmproj_path) {
+        set_error("null context, model, prompt, or mmproj_path");
+        return -1;
+    }
+    if (n_images < 0 || (n_images > 0 && !images)) {
+        set_error("invalid image array");
+        return -1;
+    }
+
+    // (Re-)initialize mtmd context if the projector file or the model changed.
+    std::string path_str(mmproj_path);
+    if (!g_mtmd_ctx || g_mtmd_path != path_str || g_mtmd_model != model) {
+        mtmd_cache_reset();
+        mtmd_context_params mparams = mtmd_context_params_default();
+        mparams.use_gpu = true;
+        mparams.warmup = true;
+        g_mtmd_ctx = mtmd_init_from_file(mmproj_path, model, mparams);
+        if (!g_mtmd_ctx) {
+            set_error("failed to initialize mtmd context from mmproj");
+            return -1;
+        }
+        g_mtmd_path = path_str;
+        g_mtmd_model = model;
+    }
+
+    // Clear KV cache.
+    llama_memory_clear(llama_get_memory(ctx), false);
+
+    const llama_vocab* vocab = llama_model_get_vocab(model);
+    if (!vocab) {
+        set_error("failed to get vocab from model");
+        return -1;
+    }
+
+    // Decode the raw image bytes into bitmaps. The owning pointers free every
+    // bitmap on all exit paths.
+    std::vector<mtmd::bitmap_ptr> bitmaps;
+    std::vector<const mtmd_bitmap*> bitmap_ptrs;
+    bitmaps.reserve((size_t)n_images);
+    bitmap_ptrs.reserve((size_t)n_images);
+    for (int i = 0; i < n_images; i++) {
+        if (!images[i].data || images[i].size <= 0) {
+            set_error("empty image data");
+            return -1;
+        }
+        mtmd::bitmap_ptr bmp(mtmd_helper_bitmap_init_from_buf(
+            g_mtmd_ctx, images[i].data, (size_t)images[i].size));
+        if (!bmp) {
+            set_error("failed to decode image data");
+            return -1;
+        }
+        bitmap_ptrs.push_back(bmp.get());
+        bitmaps.push_back(std::move(bmp));
+    }
+
+    // Tokenize prompt + images.
+    mtmd_input_text input_text;
+    input_text.text = prompt;
+    input_text.add_special = true;
+    input_text.parse_special = true;
+
+    mtmd::input_chunks_ptr chunks(mtmd_input_chunks_init());
+    if (!chunks) {
+        set_error("failed to allocate mtmd input chunks");
+        return -1;
+    }
+    int32_t tok_rc = mtmd_tokenize(g_mtmd_ctx, chunks.get(), &input_text,
+                                   bitmap_ptrs.data(), bitmap_ptrs.size());
+
+    // The bitmaps are not needed once tokenization has run.
+    bitmap_ptrs.clear();
+    bitmaps.clear();
+
+    if (tok_rc != 0) {
+        set_error("mtmd_tokenize failed (marker/image count mismatch or preprocessing error)");
+        return -1;
+    }
+
+    // Evaluate all chunks (text + image) into KV cache using the helper.
+    llama_pos n_past = 0;
+    int32_t eval_rc = mtmd_helper_eval_chunks(g_mtmd_ctx, ctx, chunks.get(),
+                                              /*n_past=*/0, /*seq_id=*/0,
+                                              (int32_t)llama_n_batch(ctx),
+                                              /*logits_last=*/true,
+                                              &n_past);
+    chunks.reset();
+    if (eval_rc != 0) {
+        set_error("mtmd_helper_eval_chunks failed");
+        return -1;
+    }
+
+    // Build sampling parameters (same as go_llama_generate).
+    common_params_sampling sparams;
+    sparams.seed = (uint32_t)params.seed;
+    sparams.temp = params.temperature;
+    sparams.top_k = params.top_k;
+    sparams.top_p = params.top_p;
+    sparams.min_p = params.min_p;
+    sparams.penalty_repeat = params.repeat_penalty;
+    sparams.penalty_freq = params.freq_penalty;
+    sparams.penalty_present = params.presence_penalty;
+    sparams.penalty_last_n = params.penalty_last_n;
+
+    std::unique_ptr<common_sampler, decltype(&common_sampler_free)> smpl(
+        common_sampler_init(model, sparams), common_sampler_free);
+    if (!smpl) {
+        set_error("failed to initialize sampler");
+        return -1;
+    }
+
+    // One single-token batch reused for every decode step; freed on all exits.
+    struct batch_guard {
+        llama_batch batch;
+        ~batch_guard() { llama_batch_free(batch); }
+    } step{llama_batch_init(1, 0, 1)};
+    step.batch.n_seq_id[0] = 1;
+    step.batch.seq_id[0][0] = 0;
+    step.batch.logits[0] = 1;
+    step.batch.n_tokens = 1;
+
+    // Generation loop.
+    std::vector<char> piece_buf(128);
+    int n_cur = (int)n_past;
+    const int n_ctx = (int)llama_n_ctx(ctx);
+    const int max_tokens = params.max_tokens > 0 ? params.max_tokens : 512;
+
+    for (int i = 0; i < max_tokens; i++) {
+        llama_token new_token = common_sampler_sample(smpl.get(), ctx, -1);
+        common_sampler_accept(smpl.get(), new_token, /*accept_grammar=*/true);
+
+        if (llama_vocab_is_eog(vocab, new_token)) {
+            break;
+        }
+
+        int n_piece = llama_token_to_piece(vocab, new_token,
+                                           piece_buf.data(), (int32_t)piece_buf.size() - 1,
+                                           /*lstrip=*/0, /*special=*/false);
+        if (n_piece < 0) {
+            // A negative result is the required size: grow the buffer and
+            // retry instead of dropping the token, which would leave it
+            // sampled but never decoded into the context.
+            piece_buf.resize((size_t)(-n_piece) + 1);
+            n_piece = llama_token_to_piece(vocab, new_token,
+                                           piece_buf.data(), (int32_t)piece_buf.size() - 1,
+                                           /*lstrip=*/0, /*special=*/false);
+        }
+        if (n_piece < 0) {
+            set_error("failed to convert token to text");
+            return -1;
+        }
+        piece_buf[n_piece] = '\0';
+
+        if (callback) {
+            bool cb_ok = callback(piece_buf.data(), n_piece, user_data);
+            if (!cb_ok) {
+                break;
+            }
+        }
+
+        // Stop before decoding past the end of the context window.
+        if (n_cur >= n_ctx) {
+            break;
+        }
+
+        // Feed the sampled token back so the next iteration has fresh logits.
+        step.batch.token[0] = new_token;
+        step.batch.pos[0] = n_cur;
+        n_cur++;
+
+        if (llama_decode(ctx, step.batch) != 0) {
+            set_error("llama_decode failed during generation");
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+// Frees the cached multimodal context (call on shutdown).
+void go_llama_mtmd_free(void) {
+    mtmd_cache_reset();
+}
+
 } // extern "C" diff --git a/ggml/llamacpp/wrapper.h b/ggml/llamacpp/wrapper.h index 3b4a3db..f82309b 100644 --- a/ggml/llamacpp/wrapper.h +++ b/ggml/llamacpp/wrapper.h @@ -55,6 +55,29 @@ int go_llama_embeddings( // Last error message (thread-local). const char* go_llama_last_error(void); +// Image data for multimodal generation. +typedef struct { + const unsigned char* data; + int size; +} go_llama_image; + +// Generation with images: tokenize prompt + images via mtmd, evaluate, then generate. +// mmproj_path: path to the multimodal projector GGUF file. +// Returns 0 on success, -1 on error. +int go_llama_generate_with_images( + void* ctx_ptr, + void* model_ptr, + const char* mmproj_path, + const char* prompt, + go_llama_image* images, + int n_images, + go_llama_generate_params params, + go_llama_token_callback callback, + void* user_data); + +// Free cached multimodal context (call on shutdown). +void go_llama_mtmd_free(void); + #ifdef __cplusplus } #endif diff --git a/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml-cuda.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml-cuda.a index 2cc3042..c4e6da5 100644 Binary files a/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml-cuda.a and b/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml-cuda.a differ