Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 75 additions & 1 deletion ggml/llamacpp/llamacpp.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ package llamacpp
#cgo CXXFLAGS: -std=c++17 -I${SRCDIR}/third_party/include -I${SRCDIR}/third_party/ggml/include -I${SRCDIR}/third_party/common
#cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/darwin-arm64
#cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/darwin-amd64
#cgo darwin LDFLAGS: -lcommon -lllama -lggml-cpu -lggml-base -lggml -lggml-blas -lggml-metal -L/usr/local/opt/libomp/lib -L/opt/homebrew/opt/libomp/lib -lomp -framework Accelerate -framework Metal -framework Foundation -lstdc++ -lm
#cgo darwin LDFLAGS: -lmtmd -lcommon -lllama -lggml-cpu -lggml-base -lggml -lggml-blas -lggml-metal -L/usr/local/opt/libomp/lib -L/opt/homebrew/opt/libomp/lib -lomp -framework Accelerate -framework Metal -framework Foundation -lstdc++ -lm
#include <stdlib.h>
#include <stdbool.h>
#include "wrapper.h"
Expand Down Expand Up @@ -317,6 +317,80 @@ func (c *Context) GetEmbeddings(text string) ([]float32, error) {
return out[:n], nil
}

// GenerateStreamWithImages runs multimodal generation with images, streaming tokens via callback.
// mmprojPath is the path to the multimodal projector GGUF file.
// images is a slice of raw image bytes (JPEG/PNG); each entry must be non-empty.
// Return false from cb to stop generation.
func (c *Context) GenerateStreamWithImages(prompt string, images [][]byte, mmprojPath string, cb func(token string) bool, opts ...GenerateOption) error {
	c.mu.Lock()
	defer c.mu.Unlock()

	if c.c == nil || c.model.c == nil {
		return errors.New("context or model is closed")
	}

	cfg := defaultGenerateConfig()
	for _, o := range opts {
		o(&cfg)
	}

	cprompt := C.CString(prompt)
	defer C.free(unsafe.Pointer(cprompt))

	cmmproj := C.CString(mmprojPath)
	defer C.free(unsafe.Pointer(cmmproj))

	params := C.go_llama_default_generate_params()
	params.max_tokens = C.int(cfg.maxTokens)
	params.temperature = C.float(cfg.temperature)
	params.top_k = C.int(cfg.topK)
	params.top_p = C.float(cfg.topP)
	params.min_p = C.float(cfg.minP)
	params.repeat_penalty = C.float(cfg.repeatPenalty)
	params.freq_penalty = C.float(cfg.freqPenalty)
	params.presence_penalty = C.float(cfg.presencePenalty)
	params.seed = C.int(cfg.seed)
	params.penalty_last_n = C.int(cfg.penaltyLastN)

	// Build a C array of go_llama_image entirely in C memory.
	//
	// The structs AND the image bytes must live on the C heap: cgo forbids
	// passing a Go pointer to memory that itself contains Go pointers, so a
	// Go-allocated []C.go_llama_image whose .data fields point into Go slices
	// would panic at the call below ("cgo argument has Go pointer to Go
	// pointer") under the default cgocheck. C.CBytes copies each image, and
	// the deferred frees run after go_llama_generate_with_images returns.
	var cImages *C.go_llama_image
	if len(images) > 0 {
		arrBytes := C.size_t(len(images)) * C.size_t(unsafe.Sizeof(C.go_llama_image{}))
		cImages = (*C.go_llama_image)(C.malloc(arrBytes))
		if cImages == nil {
			return errors.New("allocating image array failed")
		}
		defer C.free(unsafe.Pointer(cImages))
		arr := unsafe.Slice(cImages, len(images))
		for i, img := range images {
			if len(img) == 0 {
				// &img[0] / C.CBytes on an empty slice would be invalid.
				return fmt.Errorf("image %d is empty", i)
			}
			data := C.CBytes(img) // copy into C memory
			defer C.free(data)
			arr[i].data = (*C.uchar)(data)
			arr[i].size = C.int(len(img))
		}
	}

	state := &streamState{cb: cb}
	handle := registerCallback(state)
	defer unregisterCallback(handle)

	rc := C.go_llama_generate_with_images(
		unsafe.Pointer(c.c),
		unsafe.Pointer(c.model.c),
		cmmproj,
		cprompt,
		cImages,
		C.int(len(images)),
		params,
		C.go_llama_token_callback(C.goTokenCallbackBridge),
		unsafe.Pointer(handle),
	)

	if rc != 0 {
		errMsg := C.GoString(C.go_llama_last_error())
		return fmt.Errorf("generate with images failed: %s", errMsg)
	}
	return nil
}

// FreeMTMD frees the cached multimodal (mtmd) context held on the C side.
// Call once on shutdown after all multimodal generation has finished.
// NOTE(review): presumably safe to call when no multimodal context was ever
// created (the C side treating a missing context as a no-op) — confirm in
// the go_llama_mtmd_free implementation in wrapper.h/.cpp.
func FreeMTMD() {
	C.go_llama_mtmd_free()
}

// Close frees the context resources.
func (c *Context) Close() {
c.mu.Lock()
Expand Down
2 changes: 1 addition & 1 deletion ggml/llamacpp/llamacpp_android.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@ package llamacpp

/*
#cgo android,arm64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/android-arm64
#cgo android LDFLAGS: -Wl,--start-group -lcommon -lllama -lggml-cpu -lggml-base -lggml -Wl,--end-group -lstdc++ -lm -ldl -llog
#cgo android LDFLAGS: -Wl,--start-group -lmtmd -lcommon -lllama -lggml-cpu -lggml-base -lggml -Wl,--end-group -lstdc++ -lm -ldl -llog
*/
import "C"
2 changes: 1 addition & 1 deletion ggml/llamacpp/llamacpp_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,6 @@ package llamacpp
/*
#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/linux-amd64
#cgo linux,arm64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/linux-arm64
#cgo linux LDFLAGS: -Wl,--start-group -lcommon -lllama -lggml-cpu -lggml-base -lggml -Wl,--end-group -lstdc++ -lm -lpthread -ldl -lrt -lgomp
#cgo linux LDFLAGS: -Wl,--start-group -lmtmd -lcommon -lllama -lggml-cpu -lggml-base -lggml -Wl,--end-group -lstdc++ -lm -lpthread -ldl -lrt -lgomp
*/
import "C"
2 changes: 1 addition & 1 deletion ggml/llamacpp/llamacpp_linux_cuda.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@ package llamacpp

/*
#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/linux-amd64-cuda
#cgo linux LDFLAGS: -Wl,--start-group -lcommon -lllama -lggml-cpu -lggml-base -lggml -lggml-cuda -Wl,--end-group -lcuda -lcudart -lcublas -lcublasLt -lstdc++ -lm -lpthread -ldl -lrt -lgomp
#cgo linux LDFLAGS: -Wl,--start-group -lmtmd -lcommon -lllama -lggml-cpu -lggml-base -lggml -lggml-cuda -Wl,--end-group -lcuda -lcudart -lcublas -lcublasLt -lstdc++ -lm -lpthread -ldl -lrt -lgomp
*/
import "C"
2 changes: 1 addition & 1 deletion ggml/llamacpp/llamacpp_linux_vulkan.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@ package llamacpp

/*
#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/linux-amd64-vulkan
#cgo linux LDFLAGS: -Wl,--start-group -lcommon -lllama -lggml-cpu -lggml-base -lggml -lggml-vulkan -Wl,--end-group -lvulkan -lstdc++ -lm -lpthread -ldl -lrt -lgomp
#cgo linux LDFLAGS: -Wl,--start-group -lmtmd -lcommon -lllama -lggml-cpu -lggml-base -lggml -lggml-vulkan -Wl,--end-group -lvulkan -lstdc++ -lm -lpthread -ldl -lrt -lgomp
*/
import "C"
96 changes: 96 additions & 0 deletions ggml/llamacpp/third_party/include/mtmd-helper.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#ifndef MTMD_HELPER_H
#define MTMD_HELPER_H

#include "ggml.h"
#include "llama.h"
#include "mtmd.h"

#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>

#ifdef __cplusplus
extern "C" {
#endif

//
// libmtmd helper functions
//
// Please note that these helpers are not guaranteed to be stable.
// BREAKING CHANGES are expected.
//

// Set callback for all future logging events.
// If this is not called, or NULL is supplied, everything is output on stderr.
// Note: this also calls mtmd_log_set() internally.
MTMD_API void mtmd_helper_log_set(ggml_log_callback log_callback, void * user_data);

// helper function to construct a mtmd_bitmap from a file
// it calls mtmd_helper_bitmap_init_from_buf() internally
// returns nullptr on failure
// this function is thread-safe
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname);

// helper function to construct a mtmd_bitmap from a buffer containing a file
// supported formats:
//     image: formats supported by stb_image: jpg, png, bmp, gif, etc.
//     audio: formats supported by miniaudio: wav, mp3, flac
// note: audio files will be auto-detected based on magic bytes
// returns nullptr on failure
// this function is thread-safe
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len);

// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);

// helper to count the total number of positions from a list of chunks, useful to keep track of n_past
// normally, n_pos is equal to n_tokens, but for M-RoPE it is different
MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);

// helper function that automatically:
// 1. runs llama_decode() on text chunks
// 2. runs mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
// if any of the mtmd_encode() or llama_decode() calls return non-zero, stops and forwards the error
// otherwise, returns 0 on success
// this function is NOT thread-safe
MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
                                         struct llama_context * lctx,
                                         const mtmd_input_chunks * chunks,
                                         llama_pos n_past,
                                         llama_seq_id seq_id,
                                         int32_t n_batch,
                                         bool logits_last,
                                         llama_pos * new_n_past);

// works like mtmd_helper_eval_chunks(), but only for a single chunk
// this function is NOT thread-safe
MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
                                               struct llama_context * lctx,
                                               const mtmd_input_chunk * chunk,
                                               llama_pos n_past,
                                               llama_seq_id seq_id,
                                               int32_t n_batch,
                                               bool logits_last,
                                               llama_pos * new_n_past);

// helper function to decode an image whose embeddings have already been calculated
// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
// returns 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
                                                struct llama_context * lctx,
                                                const mtmd_input_chunk * chunk,
                                                float * encoded_embd,
                                                llama_pos n_past,
                                                llama_seq_id seq_id,
                                                int32_t n_batch,
                                                llama_pos * new_n_past);

#ifdef __cplusplus
} // extern "C"
#endif

//
// C++ wrappers
//

#endif
Loading
Loading