From 2386891b0671bc34aac27f3f8e214bb50644c017 Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Tue, 25 Nov 2025 18:55:14 +0100 Subject: [PATCH 01/13] Adjust to pytorch --- tools/mtmd/clip.cpp | 8 ++++---- tools/mtmd/mtmd.cpp | 4 ++++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 52ea542decc..18f62bf5868 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -3737,12 +3737,13 @@ struct img_tool { const int width = inp_size.width; const int height = inp_size.height; + auto round_by_factor = [f = align_size](float x) { return static_cast(std::round(x / static_cast(f))) * f; }; auto ceil_by_factor = [f = align_size](float x) { return static_cast(std::ceil(x / static_cast(f))) * f; }; auto floor_by_factor = [f = align_size](float x) { return static_cast(std::floor(x / static_cast(f))) * f; }; // always align up first - int h_bar = std::max(align_size, ceil_by_factor(height)); - int w_bar = std::max(align_size, ceil_by_factor(width)); + int h_bar = std::max(align_size, round_by_factor(height)); + int w_bar = std::max(align_size, round_by_factor(width)); if (h_bar * w_bar > max_pixels) { const auto beta = std::sqrt(static_cast(height * width) / max_pixels); @@ -4354,10 +4355,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str params.patch_size * params.n_merge, params.image_min_pixels, params.image_max_pixels); - const std::array pad_color = {122, 116, 104}; clip_image_u8 resized_img; - img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color); + img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, false); clip_image_f32_ptr res(clip_image_f32_init()); normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std); res_imgs->entries.push_back(std::move(res)); diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index dfad9cd7957..6690bf30046 100644 --- a/tools/mtmd/mtmd.cpp +++ 
b/tools/mtmd/mtmd.cpp @@ -304,6 +304,10 @@ struct mtmd_context { img_beg = "<|im_start|>"; img_end = "<|im_end|>"; + } else if (proj == PROJECTOR_TYPE_LFM2) { + img_beg = "<|image_start|>"; + img_end = "<|image_end|>"; + } } From c5090733d81cec544c867b6da91f253ce472b115 Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Thu, 27 Nov 2025 13:23:02 +0100 Subject: [PATCH 02/13] Add antialiasing upscale --- ggml/include/ggml.h | 7 ++-- ggml/src/ggml-cpu/ops.cpp | 87 +++++++++++++++++++++++++++++++++++++++ tools/mtmd/clip.cpp | 2 +- 3 files changed, 92 insertions(+), 4 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 4dbca868bc7..1bffb35ae9f 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2140,9 +2140,10 @@ extern "C" { float p1); enum ggml_scale_mode { - GGML_SCALE_MODE_NEAREST = 0, - GGML_SCALE_MODE_BILINEAR = 1, - GGML_SCALE_MODE_BICUBIC = 2, + GGML_SCALE_MODE_NEAREST = 0, + GGML_SCALE_MODE_BILINEAR = 1, + GGML_SCALE_MODE_BICUBIC = 2, + GGML_SCALE_MODE_BILINEAR_AA = 3, // bilinear with antialiasing (box filter for downsampling) GGML_SCALE_MODE_COUNT }; diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 2745fc54e15..cf2ccfd2fb9 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -7500,6 +7500,93 @@ static void ggml_compute_forward_upscale_f32( bicubic(p(-1, 1), p(0, 1), p(1, 1), p(2, 1), dx), bicubic(p(-1, 2), p(0, 2), p(1, 2), p(2, 2), dx), dy); + float * y_dst = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); + *y_dst = val; + } + } + } + } + } else if (mode == GGML_SCALE_MODE_BILINEAR_AA) { + // Bilinear with antialiasing - matches PyTorch's F.interpolate(..., mode='bilinear', antialias=True) + // This implementation follows PyTorch's approach: + // - scale = input_size / output_size (NOT output/input!) 
+ // - For downsampling (scale > 1): support = 1.0 * scale, invscale = 1.0 / scale + // - For upsampling (scale <= 1): support = 1.0, invscale = 1.0 + // See: https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/cpu/UpSampleKernel.cpp + + const int interp_size = 2; // bilinear + + // PyTorch's bilinear filter function: f(x) = max(0, 1 - |x|) + auto bilinear_filter = [](float x) -> float { + x = fabsf(x); + if (x < 1.0f) { + return 1.0f - x; + } + return 0.0f; + }; + + // Compute scales as input_size / output_size + const float scale0 = (float)ne00 / (float)ne0; + const float scale1 = (float)ne01 / (float)ne1; + + for (int64_t i3 = 0; i3 < ne3; i3++) { + const int64_t i03 = i3 / sf3; + for (int64_t i2 = ith; i2 < ne2; i2 += nth) { + const int64_t i02 = i2 / sf2; + for (int64_t i1 = 0; i1 < ne1; i1++) { + // Compute center position in source coordinates + // PyTorch formula: center = scale * (i + 0.5) + const float center_y = scale1 * ((float)i1 + 0.5f); + + // Compute support and invscale for y direction + // When downsampling (scale > 1), we need wider support for antialiasing + const float support_y = (scale1 > 1.0f) ? (interp_size * 0.5f) * scale1 : interp_size * 0.5f; + const float invscale_y = (scale1 > 1.0f) ? (1.0f / scale1) : 1.0f; + + for (int64_t i0 = 0; i0 < ne0; i0++) { + const float center_x = scale0 * ((float)i0 + 0.5f); + + // Compute support and invscale for x direction + const float support_x = (scale0 > 1.0f) ? (interp_size * 0.5f) * scale0 : interp_size * 0.5f; + const float invscale_x = (scale0 > 1.0f) ? 
(1.0f / scale0) : 1.0f; + + // Calculate the range of source pixels that contribute + const int64_t x_min = std::max(int64_t(0), (int64_t)(center_x - support_x + 0.5f)); + const int64_t x_max = std::min(ne00, (int64_t)(center_x + support_x + 0.5f)); + const int64_t y_min = std::max(int64_t(0), (int64_t)(center_y - support_y + 0.5f)); + const int64_t y_max = std::min(ne01, (int64_t)(center_y + support_y + 0.5f)); + + float val = 0.0f; + float total_weight = 0.0f; + + // Apply bilinear filter with antialiasing + for (int64_t sy = y_min; sy < y_max; sy++) { + // Compute bilinear weight for y direction + const float weight_y = bilinear_filter((sy - center_y + 0.5f) * invscale_y); + + for (int64_t sx = x_min; sx < x_max; sx++) { + // Compute bilinear weight for x direction + const float weight_x = bilinear_filter((sx - center_x + 0.5f) * invscale_x); + + const float weight = weight_x * weight_y; + + if (weight > 0.0f) { + const float pixel = *(const float *)((const char *)src0->data + + sx*nb00 + + sy*nb01 + + i02*nb02 + + i03*nb03); + val += pixel * weight; + total_weight += weight; + } + } + } + + // Normalize by total weight + if (total_weight > 0.0f) { + val /= total_weight; + } + float * y_dst = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); *y_dst = val; } diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 18f62bf5868..f571a6102c8 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -2012,7 +2012,7 @@ struct clip_graph { ggml_tensor * pos_embd = model.position_embeddings; const int height = img.ny / patch_size; const int width = img.nx / patch_size; - const uint32_t mode = GGML_SCALE_MODE_BILINEAR; + const uint32_t mode = GGML_SCALE_MODE_BILINEAR_AA; const int n_per_side = (int)std::sqrt(pos_embd->ne[1]); GGML_ASSERT(pos_embd); From 80b4e97e3aac43d1dc27fab79271c43fa57f643e Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Thu, 27 Nov 2025 13:23:33 +0100 Subject: [PATCH 03/13] Increase number of patches to 1024 --- 
tools/mtmd/clip.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index f571a6102c8..9cd37835451 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -2787,7 +2787,7 @@ struct clip_model_loader { { get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); // ref: https://huggingface.co/LiquidAI/LFM2-VL-3B/blob/main/preprocessor_config.json - hparams.set_limit_image_tokens(64, 256); + hparams.set_limit_image_tokens(64, 1024); } break; case PROJECTOR_TYPE_PIXTRAL: case PROJECTOR_TYPE_LIGHTONOCR: From 1cd4e2fd59185c76ac6e45b65b93288eae3f56dc Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Thu, 27 Nov 2025 13:54:04 +0100 Subject: [PATCH 04/13] Handle default marker insertion for LFM2 --- tools/mtmd/mtmd-cli.cpp | 4 ++-- tools/mtmd/mtmd.cpp | 9 +++++++++ tools/mtmd/mtmd.h | 3 +++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp index 6679de309b4..fff8714bfe3 100644 --- a/tools/mtmd/mtmd-cli.cpp +++ b/tools/mtmd/mtmd-cli.cpp @@ -313,7 +313,7 @@ int main(int argc, char ** argv) { g_is_generating = true; if (params.prompt.find(mtmd_default_marker()) == std::string::npos) { for (size_t i = 0; i < params.image.size(); i++) { - params.prompt += mtmd_default_marker(); + params.prompt = mtmd::mtmd_add_default_marker(ctx.ctx_vision.get(), params.prompt); } } common_chat_msg msg; @@ -378,7 +378,7 @@ int main(int argc, char ** argv) { std::string media_path = line.substr(7); if (ctx.load_media(media_path)) { LOG("%s %s loaded\n", media_path.c_str(), is_image ? 
"image" : "audio"); - content += mtmd_default_marker(); + content = mtmd::mtmd_add_default_marker(ctx.ctx_vision.get(), content); } // else, error is already printed by libmtmd continue; diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 6690bf30046..e616c464d05 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -1103,3 +1103,12 @@ void mtmd_log_set(ggml_log_callback log_callback, void * user_data) { g_logger_state.log_callback = log_callback ? log_callback : clip_log_callback_default; g_logger_state.log_callback_user_data = user_data; } + +std::string mtmd::mtmd_add_default_marker(mtmd_context *ctx, const std::string &str) { + // for LFM2 image embeddings positioned before the text + if (ctx && ctx->ctx_v && clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_LFM2) { + return mtmd_default_marker() + str; + } + + return str + mtmd_default_marker(); +} diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index 015119be897..d204bd594db 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -299,6 +299,9 @@ struct input_chunks { } }; +// insert mtmd_default_marker() into given string, position depends on the projector +std::string mtmd_add_default_marker(mtmd_context *ctx, const std::string &str); + } // namespace mtmd #endif From 40e08b83a8f67f2ac5b135612afabc7f215ca0e4 Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Thu, 27 Nov 2025 15:06:12 +0100 Subject: [PATCH 05/13] Switch to flag --- ggml/include/ggml.h | 10 +-- ggml/src/ggml-cpu/ops.cpp | 180 ++++++++++++++++++++------------------ tools/mtmd/clip.cpp | 2 +- 3 files changed, 99 insertions(+), 93 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 1bffb35ae9f..48da68fe7e3 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2140,16 +2140,16 @@ extern "C" { float p1); enum ggml_scale_mode { - GGML_SCALE_MODE_NEAREST = 0, - GGML_SCALE_MODE_BILINEAR = 1, - GGML_SCALE_MODE_BICUBIC = 2, - GGML_SCALE_MODE_BILINEAR_AA = 3, // bilinear with antialiasing (box 
filter for downsampling) + GGML_SCALE_MODE_NEAREST = 0, + GGML_SCALE_MODE_BILINEAR = 1, + GGML_SCALE_MODE_BICUBIC = 2, GGML_SCALE_MODE_COUNT }; enum ggml_scale_flag { - GGML_SCALE_FLAG_ALIGN_CORNERS = (1 << 8) + GGML_SCALE_FLAG_ALIGN_CORNERS = (1 << 8), + GGML_SCALE_FLAG_ANTIALIAS = (1 << 9), }; // interpolate diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index cf2ccfd2fb9..f6f2805ef23 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -7402,6 +7402,99 @@ static void ggml_compute_forward_upscale_f32( sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1; } + // Antialiasing preprocessing step + // Apply antialiasing filter if flag is set and write directly to dst + bool antialiasing_applied = false; + + if (mode_flags & GGML_SCALE_FLAG_ANTIALIAS) { + // Only apply antialiasing when downsampling (scale < 1.0) + const float scale0 = (float)ne00 / (float)ne0; + const float scale1 = (float)ne01 / (float)ne1; + + if (scale0 > 1.0f || scale1 > 1.0f) { + // Apply antialiasing filter to src0 and write directly to dst + // PyTorch's bilinear filter function: f(x) = max(0, 1 - |x|) + auto bilinear_filter = [](float x) -> float { + x = fabsf(x); + if (x < 1.0f) { + return 1.0f - x; + } + return 0.0f; + }; + + const int interp_size = 2; // bilinear + + for (int64_t i3 = 0; i3 < ne3; i3++) { + const int64_t i03 = i3 / sf3; + for (int64_t i2 = ith; i2 < ne2; i2 += nth) { + const int64_t i02 = i2 / sf2; + for (int64_t i1 = 0; i1 < ne1; i1++) { + // Compute center position in source coordinates + const float center_y = scale1 * ((float)i1 + 0.5f); + + // Compute support and invscale for y direction + const float support_y = (scale1 > 1.0f) ? (interp_size * 0.5f) * scale1 : interp_size * 0.5f; + const float invscale_y = (scale1 > 1.0f) ? 
(1.0f / scale1) : 1.0f; + + for (int64_t i0 = 0; i0 < ne0; i0++) { + const float center_x = scale0 * ((float)i0 + 0.5f); + + // Compute support and invscale for x direction + const float support_x = (scale0 > 1.0f) ? (interp_size * 0.5f) * scale0 : interp_size * 0.5f; + const float invscale_x = (scale0 > 1.0f) ? (1.0f / scale0) : 1.0f; + + // Calculate the range of source pixels that contribute + const int64_t x_min = std::max(int64_t(0), (int64_t)(center_x - support_x + 0.5f)); + const int64_t x_max = std::min(ne00, (int64_t)(center_x + support_x + 0.5f)); + const int64_t y_min = std::max(int64_t(0), (int64_t)(center_y - support_y + 0.5f)); + const int64_t y_max = std::min(ne01, (int64_t)(center_y + support_y + 0.5f)); + + float val = 0.0f; + float total_weight = 0.0f; + + // Apply bilinear filter with antialiasing + for (int64_t sy = y_min; sy < y_max; sy++) { + const float weight_y = bilinear_filter((sy - center_y + 0.5f) * invscale_y); + + for (int64_t sx = x_min; sx < x_max; sx++) { + const float weight_x = bilinear_filter((sx - center_x + 0.5f) * invscale_x); + const float weight = weight_x * weight_y; + + if (weight > 0.0f) { + const float pixel = *(const float *)((const char *)src0->data + + sx*nb00 + + sy*nb01 + + i02*nb02 + + i03*nb03); + val += pixel * weight; + total_weight += weight; + } + } + } + + // Normalize by total weight + if (total_weight > 0.0f) { + val /= total_weight; + } + + // Write directly to dst + float * dst_ptr = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); + *dst_ptr = val; + } + } + } + } + + antialiasing_applied = true; + } + } + + // If antialiasing was not applied, proceed with regular interpolation + if (antialiasing_applied) { + // Antialiasing result is already in dst, we're done + return; + } + if (mode == GGML_SCALE_MODE_NEAREST) { for (int64_t i3 = 0; i3 < ne3; i3++) { const int64_t i03 = i3 / sf3; @@ -7500,93 +7593,6 @@ static void ggml_compute_forward_upscale_f32( bicubic(p(-1, 1), p(0, 1), p(1, 1), 
p(2, 1), dx), bicubic(p(-1, 2), p(0, 2), p(1, 2), p(2, 2), dx), dy); - float * y_dst = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); - *y_dst = val; - } - } - } - } - } else if (mode == GGML_SCALE_MODE_BILINEAR_AA) { - // Bilinear with antialiasing - matches PyTorch's F.interpolate(..., mode='bilinear', antialias=True) - // This implementation follows PyTorch's approach: - // - scale = input_size / output_size (NOT output/input!) - // - For downsampling (scale > 1): support = 1.0 * scale, invscale = 1.0 / scale - // - For upsampling (scale <= 1): support = 1.0, invscale = 1.0 - // See: https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/cpu/UpSampleKernel.cpp - - const int interp_size = 2; // bilinear - - // PyTorch's bilinear filter function: f(x) = max(0, 1 - |x|) - auto bilinear_filter = [](float x) -> float { - x = fabsf(x); - if (x < 1.0f) { - return 1.0f - x; - } - return 0.0f; - }; - - // Compute scales as input_size / output_size - const float scale0 = (float)ne00 / (float)ne0; - const float scale1 = (float)ne01 / (float)ne1; - - for (int64_t i3 = 0; i3 < ne3; i3++) { - const int64_t i03 = i3 / sf3; - for (int64_t i2 = ith; i2 < ne2; i2 += nth) { - const int64_t i02 = i2 / sf2; - for (int64_t i1 = 0; i1 < ne1; i1++) { - // Compute center position in source coordinates - // PyTorch formula: center = scale * (i + 0.5) - const float center_y = scale1 * ((float)i1 + 0.5f); - - // Compute support and invscale for y direction - // When downsampling (scale > 1), we need wider support for antialiasing - const float support_y = (scale1 > 1.0f) ? (interp_size * 0.5f) * scale1 : interp_size * 0.5f; - const float invscale_y = (scale1 > 1.0f) ? (1.0f / scale1) : 1.0f; - - for (int64_t i0 = 0; i0 < ne0; i0++) { - const float center_x = scale0 * ((float)i0 + 0.5f); - - // Compute support and invscale for x direction - const float support_x = (scale0 > 1.0f) ? 
(interp_size * 0.5f) * scale0 : interp_size * 0.5f; - const float invscale_x = (scale0 > 1.0f) ? (1.0f / scale0) : 1.0f; - - // Calculate the range of source pixels that contribute - const int64_t x_min = std::max(int64_t(0), (int64_t)(center_x - support_x + 0.5f)); - const int64_t x_max = std::min(ne00, (int64_t)(center_x + support_x + 0.5f)); - const int64_t y_min = std::max(int64_t(0), (int64_t)(center_y - support_y + 0.5f)); - const int64_t y_max = std::min(ne01, (int64_t)(center_y + support_y + 0.5f)); - - float val = 0.0f; - float total_weight = 0.0f; - - // Apply bilinear filter with antialiasing - for (int64_t sy = y_min; sy < y_max; sy++) { - // Compute bilinear weight for y direction - const float weight_y = bilinear_filter((sy - center_y + 0.5f) * invscale_y); - - for (int64_t sx = x_min; sx < x_max; sx++) { - // Compute bilinear weight for x direction - const float weight_x = bilinear_filter((sx - center_x + 0.5f) * invscale_x); - - const float weight = weight_x * weight_y; - - if (weight > 0.0f) { - const float pixel = *(const float *)((const char *)src0->data + - sx*nb00 + - sy*nb01 + - i02*nb02 + - i03*nb03); - val += pixel * weight; - total_weight += weight; - } - } - } - - // Normalize by total weight - if (total_weight > 0.0f) { - val /= total_weight; - } - float * y_dst = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); *y_dst = val; } diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 9cd37835451..06e469a626e 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -2012,7 +2012,7 @@ struct clip_graph { ggml_tensor * pos_embd = model.position_embeddings; const int height = img.ny / patch_size; const int width = img.nx / patch_size; - const uint32_t mode = GGML_SCALE_MODE_BILINEAR_AA; + const uint32_t mode = GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS; const int n_per_side = (int)std::sqrt(pos_embd->ne[1]); GGML_ASSERT(pos_embd); From 65789e5b14450ea7b345222fa2df92164b541473 Mon Sep 17 00:00:00 2001 From: 
Tarek Dakhran Date: Fri, 28 Nov 2025 15:56:18 +0100 Subject: [PATCH 06/13] Reformat --- ggml/src/ggml-cpu/ops.cpp | 136 ++++++++++++++------------------------ 1 file changed, 51 insertions(+), 85 deletions(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index f6f2805ef23..0210956372d 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -7402,100 +7402,66 @@ static void ggml_compute_forward_upscale_f32( sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1; } - // Antialiasing preprocessing step - // Apply antialiasing filter if flag is set and write directly to dst - bool antialiasing_applied = false; - - if (mode_flags & GGML_SCALE_FLAG_ANTIALIAS) { - // Only apply antialiasing when downsampling (scale < 1.0) - const float scale0 = (float)ne00 / (float)ne0; - const float scale1 = (float)ne01 / (float)ne1; - - if (scale0 > 1.0f || scale1 > 1.0f) { - // Apply antialiasing filter to src0 and write directly to dst - // PyTorch's bilinear filter function: f(x) = max(0, 1 - |x|) - auto bilinear_filter = [](float x) -> float { - x = fabsf(x); - if (x < 1.0f) { - return 1.0f - x; - } - return 0.0f; - }; - - const int interp_size = 2; // bilinear - - for (int64_t i3 = 0; i3 < ne3; i3++) { - const int64_t i03 = i3 / sf3; - for (int64_t i2 = ith; i2 < ne2; i2 += nth) { - const int64_t i02 = i2 / sf2; - for (int64_t i1 = 0; i1 < ne1; i1++) { - // Compute center position in source coordinates - const float center_y = scale1 * ((float)i1 + 0.5f); - - // Compute support and invscale for y direction - const float support_y = (scale1 > 1.0f) ? (interp_size * 0.5f) * scale1 : interp_size * 0.5f; - const float invscale_y = (scale1 > 1.0f) ? (1.0f / scale1) : 1.0f; - - for (int64_t i0 = 0; i0 < ne0; i0++) { - const float center_x = scale0 * ((float)i0 + 0.5f); - - // Compute support and invscale for x direction - const float support_x = (scale0 > 1.0f) ? 
(interp_size * 0.5f) * scale0 : interp_size * 0.5f; - const float invscale_x = (scale0 > 1.0f) ? (1.0f / scale0) : 1.0f; - - // Calculate the range of source pixels that contribute - const int64_t x_min = std::max(int64_t(0), (int64_t)(center_x - support_x + 0.5f)); - const int64_t x_max = std::min(ne00, (int64_t)(center_x + support_x + 0.5f)); - const int64_t y_min = std::max(int64_t(0), (int64_t)(center_y - support_y + 0.5f)); - const int64_t y_max = std::min(ne01, (int64_t)(center_y + support_y + 0.5f)); - - float val = 0.0f; - float total_weight = 0.0f; - - // Apply bilinear filter with antialiasing - for (int64_t sy = y_min; sy < y_max; sy++) { - const float weight_y = bilinear_filter((sy - center_y + 0.5f) * invscale_y); - - for (int64_t sx = x_min; sx < x_max; sx++) { - const float weight_x = bilinear_filter((sx - center_x + 0.5f) * invscale_x); - const float weight = weight_x * weight_y; - - if (weight > 0.0f) { - const float pixel = *(const float *)((const char *)src0->data + - sx*nb00 + - sy*nb01 + - i02*nb02 + - i03*nb03); - val += pixel * weight; - total_weight += weight; - } + // Similar to F.interpolate(..., mode="bilinear", align_corners=False, antialias=True) + // https://github.com/pytorch/pytorch/blob/8871ff29b743948d1225389d5b7068f37b22750b/aten/src/ATen/native/cpu/UpSampleKernel.cpp + if (mode == GGML_SCALE_MODE_BILINEAR && (mode_flags & GGML_SCALE_FLAG_ANTIALIAS)) { + auto triangle_filter = [](float x) -> float { + return std::max(1.0f - fabsf(x), 0.f); + }; + + // support and invscale, maximum 1 pixel for bilinear + const float support1 = std::max(1.f, 1.f / sf1); + const float invscale1 = 1.0 / support1; + const float support0 = std::max(1.f, 1.f / sf0); + const float invscale0 = 1.f / support0; + + for (int64_t i3 = 0; i3 < ne3; i3++) { + const int64_t i03 = i3 / sf3; + for (int64_t i2 = ith; i2 < ne2; i2 += nth) { + const int64_t i02 = i2 / sf2; + for (int64_t i1 = 0; i1 < ne1; i1++) { + const float y = ((float) i1 + pixel_offset) / sf1; + 
for (int64_t i0 = 0; i0 < ne0; i0++) { + const float x = ((float) i0 + pixel_offset) / sf0; + + // the range of source pixels that contribute + const int64_t x_min = std::max(int64_t(0), (int64_t) (x - support0 + pixel_offset)); + const int64_t x_max = std::min(ne00, (int64_t) (x + support0 + pixel_offset)); + const int64_t y_min = std::max(int64_t(0), (int64_t) (y - support1 + pixel_offset)); + const int64_t y_max = std::min(ne01, (int64_t) (y + support1 + pixel_offset)); + + // bilinear filter with antialiasing + float val = 0.0f; + float total_weight = 0.0f; + + for (int64_t sy = y_min; sy < y_max; sy++) { + const float weight_y = triangle_filter((sy - y + pixel_offset) * invscale1); + + for (int64_t sx = x_min; sx < x_max; sx++) { + const float weight_x = triangle_filter((sx - x + pixel_offset) * invscale0); + const float weight = weight_x * weight_y; + + if (weight <= 0.0f) { + continue; } - } - // Normalize by total weight - if (total_weight > 0.0f) { - val /= total_weight; + const float pixel = *(const float *)((const char *)src0->data + sx*nb00 + sy*nb01 + i02*nb02 + i03*nb03); + val += pixel * weight; + total_weight += weight; } + } - // Write directly to dst - float * dst_ptr = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); - *dst_ptr = val; + if (total_weight > 0.0f) { + val /= total_weight; } + + float * dst_ptr = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); + *dst_ptr = val; } } } - - antialiasing_applied = true; } - } - - // If antialiasing was not applied, proceed with regular interpolation - if (antialiasing_applied) { - // Antialiasing result is already in dst, we're done - return; - } - - if (mode == GGML_SCALE_MODE_NEAREST) { + } else if (mode == GGML_SCALE_MODE_NEAREST) { for (int64_t i3 = 0; i3 < ne3; i3++) { const int64_t i03 = i3 / sf3; for (int64_t i2 = ith; i2 < ne2; i2 += nth) { From 7cf67d6cef1972e216491dd4b2f1995dc673fe55 Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Fri, 28 Nov 2025 16:30:57 
+0100 Subject: [PATCH 07/13] Cuda implementation of antialias kernel --- ggml/src/ggml-cpu/ops.cpp | 8 ++-- ggml/src/ggml-cuda/upscale.cu | 81 +++++++++++++++++++++++++++++++++-- tests/test-backend-ops.cpp | 2 +- 3 files changed, 83 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 0210956372d..531292d3d5d 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -7425,10 +7425,10 @@ static void ggml_compute_forward_upscale_f32( const float x = ((float) i0 + pixel_offset) / sf0; // the range of source pixels that contribute - const int64_t x_min = std::max(int64_t(0), (int64_t) (x - support0 + pixel_offset)); - const int64_t x_max = std::min(ne00, (int64_t) (x + support0 + pixel_offset)); - const int64_t y_min = std::max(int64_t(0), (int64_t) (y - support1 + pixel_offset)); - const int64_t y_max = std::min(ne01, (int64_t) (y + support1 + pixel_offset)); + const int64_t x_min = std::max(x - support0 + pixel_offset, 0); + const int64_t x_max = std::min(x + support0 + pixel_offset, ne00); + const int64_t y_min = std::max(y - support1 + pixel_offset, 0); + const int64_t y_max = std::min(y + support1 + pixel_offset, ne01); // bilinear filter with antialiasing float val = 0.0f; diff --git a/ggml/src/ggml-cuda/upscale.cu b/ggml/src/ggml-cuda/upscale.cu index 687c669304d..9b9c6a2579a 100644 --- a/ggml/src/ggml-cuda/upscale.cu +++ b/ggml/src/ggml-cuda/upscale.cu @@ -81,6 +81,76 @@ static __global__ void upscale_f32_bilinear(const float * x, float * dst, dst[index] = result; } +// Similar to F.interpolate(..., mode="bilinear", align_corners=False, antialias=True) +// https://github.com/pytorch/pytorch/blob/8871ff29b743948d1225389d5b7068f37b22750b/aten/src/ATen/native/cpu/UpSampleKernel.cpp +static __global__ void upscale_f32_bilinear_antialias(const float * src0, float * dst, + const int nb00, const int nb01, const int nb02, const int nb03, + const int ne00_src, const int ne01_src, + const int ne10_dst, 
const int ne11_dst, const int ne12_dst, const int ne13_dst, + const float sf0, const float sf1, const float sf2, const float sf3, + const float pixel_offset) { + const int64_t index = threadIdx.x + blockIdx.x * blockDim.x; + const int64_t dst_total_elements = ne10_dst * ne11_dst * ne12_dst * ne13_dst; + + if (index >= dst_total_elements) { + return; + } + + const int i10_dst = index % ne10_dst; + const int i11_dst = (index / ne10_dst) % ne11_dst; + const int i12_dst = (index / (ne10_dst * ne11_dst)) % ne12_dst; + const int i13_dst = index / (ne10_dst * ne11_dst * ne12_dst); + + const int i02_src = (int)(i12_dst / sf2); + const int i03_src = (int)(i13_dst / sf3); + + const float y = ((float)i11_dst + pixel_offset) / sf1; + const float x = ((float)i10_dst + pixel_offset) / sf0; + + // support and invscale, maximum 1 pixel for bilinear + const float support1 = max(1.f / sf1, 1.f); + const float invscale1 = 1.0 / support1; + const float support0 = max(1.f / sf0, 1.f); + const float invscale0 = 1.f / support0; + + // the range of source pixels that contribute + const int64_t x_min = max(int64_t(0), int64_t(x - support0 + pixel_offset)); + const int64_t x_max = min(int64_t(ne00_src), int64_t(x + support0 + pixel_offset)); + const int64_t y_min = max(int64_t(0), int64_t(y - support1 + pixel_offset)); + const int64_t y_max = min(int64_t(ne01_src), int64_t(y + support1 + pixel_offset)); + + // bilinear filter with antialiasing + float val = 0.0f; + float total_weight = 0.0f; + + auto triangle_filter = [](float x) -> float { + return max(1.0f - fabsf(x), 0.f); + }; + + for (int64_t sy = y_min; sy < y_max; sy++) { + const float weight_y = triangle_filter((sy - y + pixel_offset) * invscale1); + + for (int64_t sx = x_min; sx < x_max; sx++) { + const float weight_x = triangle_filter((sx - x + pixel_offset) * invscale0); + const float weight = weight_x * weight_y; + + if (weight <= 0.0f) { + continue; + } + + const float pixel = *(const float *)((const char *)src0 + sx*nb00 + 
sy*nb01 + i02_src*nb02 + i03_src*nb03); + val += pixel * weight; + total_weight += weight; + } + } + + if (total_weight > 0.0f) { + val /= total_weight; + } + + dst[index] = val; +} + namespace bicubic_interpolation { // https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm __device__ const float a = -0.75f; // use alpha = -0.75 (same as PyTorch) @@ -161,11 +231,15 @@ static void upscale_f32_bilinear_cuda(const float * x, float * dst, const int ne00_src, const int ne01_src, const int ne10_dst, const int ne11_dst, const int ne12_dst, const int ne13_dst, const float sf0, const float sf1, const float sf2, const float sf3, - const float pixel_offset, cudaStream_t stream) { + const float pixel_offset, bool antialias, cudaStream_t stream) { const int64_t dst_size = ne10_dst * ne11_dst * ne12_dst * ne13_dst; const int64_t num_blocks = (dst_size + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE; - upscale_f32_bilinear<<>>(x, dst, nb00, nb01, nb02, nb03, ne00_src, ne01_src, ne10_dst, ne11_dst, ne12_dst, ne13_dst, sf0, sf1, sf2, sf3, pixel_offset); + if (antialias) { + upscale_f32_bilinear_antialias<<>>(x, dst, nb00, nb01, nb02, nb03, ne00_src, ne01_src, ne10_dst, ne11_dst, ne12_dst, ne13_dst, sf0, sf1, sf2, sf3, pixel_offset); + } else { + upscale_f32_bilinear<<>>(x, dst, nb00, nb01, nb02, nb03, ne00_src, ne01_src, ne10_dst, ne11_dst, ne12_dst, ne13_dst, sf0, sf1, sf2, sf3, pixel_offset); + } } static void upscale_f32_bicubic_cuda(const float * x, float * dst, @@ -207,9 +281,10 @@ void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { if (mode == GGML_SCALE_MODE_NEAREST) { upscale_f32_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, stream); } else if (mode == GGML_SCALE_MODE_BILINEAR) { + bool antialias = (mode_flags & GGML_SCALE_FLAG_ANTIALIAS); upscale_f32_bilinear_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], 
src0->nb[3], src0->ne[0], src0->ne[1], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], - sf0, sf1, sf2, sf3, pixel_offset, stream); + sf0, sf1, sf2, sf3, pixel_offset, antialias, stream); } else if (mode == GGML_SCALE_MODE_BICUBIC) { upscale_f32_bicubic_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], src0->ne[0], src0->ne[1], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 87a61aa1224..9645d0b3909 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -7660,7 +7660,7 @@ static std::vector> make_test_cases_eval() { // test_cases.emplace_back(new test_top_k(GGML_TYPE_F32, {i, 2, 1, 3}, rand() % i + 1)); //} - for (ggml_scale_mode mode : {GGML_SCALE_MODE_NEAREST, GGML_SCALE_MODE_BILINEAR, GGML_SCALE_MODE_BICUBIC}) { + for (ggml_scale_mode mode : {GGML_SCALE_MODE_NEAREST, GGML_SCALE_MODE_BILINEAR, GGML_SCALE_MODE_BICUBIC, ggml_scale_mode(GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS)}) { test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, {512, 512, 3, 2}, 2, mode)); test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, {512, 512, 3, 2}, 2, mode, true)); test_cases.emplace_back(new test_interpolate(GGML_TYPE_F32, {2, 5, 7, 11}, {5, 7, 11, 13}, mode)); From 7c8b09868c6fce98fa3fa94f19ee0aca52080179 Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Fri, 28 Nov 2025 16:32:25 +0100 Subject: [PATCH 08/13] Change placement in ops.cpp --- ggml/src/ggml-cpu/ops.cpp | 42 +++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 531292d3d5d..07f651f1a00 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -7402,9 +7402,27 @@ static void ggml_compute_forward_upscale_f32( sf1 = ne1 > 1 && ne01 > 1 ? 
(float)(ne1 - 1) / (ne01 - 1) : sf1; } - // Similar to F.interpolate(..., mode="bilinear", align_corners=False, antialias=True) - // https://github.com/pytorch/pytorch/blob/8871ff29b743948d1225389d5b7068f37b22750b/aten/src/ATen/native/cpu/UpSampleKernel.cpp - if (mode == GGML_SCALE_MODE_BILINEAR && (mode_flags & GGML_SCALE_FLAG_ANTIALIAS)) { + if (mode == GGML_SCALE_MODE_NEAREST) { + for (int64_t i3 = 0; i3 < ne3; i3++) { + const int64_t i03 = i3 / sf3; + for (int64_t i2 = ith; i2 < ne2; i2 += nth) { + const int64_t i02 = i2 / sf2; + for (int64_t i1 = 0; i1 < ne1; i1++) { + const int64_t i01 = i1 / sf1; + for (int64_t i0 = 0; i0 < ne0; i0++) { + const int64_t i00 = i0 / sf0; + + const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); + + *y = *x; + } + } + } + } + } else if (mode == GGML_SCALE_MODE_BILINEAR && (mode_flags & GGML_SCALE_FLAG_ANTIALIAS)) { + // Similar to F.interpolate(..., mode="bilinear", align_corners=False, antialias=True) + // https://github.com/pytorch/pytorch/blob/8871ff29b743948d1225389d5b7068f37b22750b/aten/src/ATen/native/cpu/UpSampleKernel.cpp auto triangle_filter = [](float x) -> float { return std::max(1.0f - fabsf(x), 0.f); }; @@ -7461,24 +7479,6 @@ static void ggml_compute_forward_upscale_f32( } } } - } else if (mode == GGML_SCALE_MODE_NEAREST) { - for (int64_t i3 = 0; i3 < ne3; i3++) { - const int64_t i03 = i3 / sf3; - for (int64_t i2 = ith; i2 < ne2; i2 += nth) { - const int64_t i02 = i2 / sf2; - for (int64_t i1 = 0; i1 < ne1; i1++) { - const int64_t i01 = i1 / sf1; - for (int64_t i0 = 0; i0 < ne0; i0++) { - const int64_t i00 = i0 / sf0; - - const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); - - *y = *x; - } - } - } - } } else if (mode == GGML_SCALE_MODE_BILINEAR) { for (int64_t i3 = 0; i3 < ne3; 
i3++) { const int64_t i03 = i3 / sf3; From 3ea706e94386b960237f3461ce9658c36e5530f4 Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Fri, 28 Nov 2025 16:45:58 +0100 Subject: [PATCH 09/13] consistent float literals --- ggml/src/ggml-cpu/ops.cpp | 10 +++++----- ggml/src/ggml-cuda/upscale.cu | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 07f651f1a00..32f937cad48 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -7424,14 +7424,14 @@ static void ggml_compute_forward_upscale_f32( // Similar to F.interpolate(..., mode="bilinear", align_corners=False, antialias=True) // https://github.com/pytorch/pytorch/blob/8871ff29b743948d1225389d5b7068f37b22750b/aten/src/ATen/native/cpu/UpSampleKernel.cpp auto triangle_filter = [](float x) -> float { - return std::max(1.0f - fabsf(x), 0.f); + return std::max(1.0f - fabsf(x), 0.0f); }; // support and invscale, maximum 1 pixel for bilinear - const float support1 = std::max(1.f, 1.f / sf1); - const float invscale1 = 1.0 / support1; - const float support0 = std::max(1.f, 1.f / sf0); - const float invscale0 = 1.f / support0; + const float support1 = std::max(1.0f, 1.0f / sf1); + const float invscale1 = 1.0f / support1; + const float support0 = std::max(1.0f, 1.0f / sf0); + const float invscale0 = 1.0f / support0; for (int64_t i3 = 0; i3 < ne3; i3++) { const int64_t i03 = i3 / sf3; diff --git a/ggml/src/ggml-cuda/upscale.cu b/ggml/src/ggml-cuda/upscale.cu index 9b9c6a2579a..7d4a617e3c9 100644 --- a/ggml/src/ggml-cuda/upscale.cu +++ b/ggml/src/ggml-cuda/upscale.cu @@ -108,10 +108,10 @@ static __global__ void upscale_f32_bilinear_antialias(const float * src0, float const float x = ((float)i10_dst + pixel_offset) / sf0; // support and invscale, maximum 1 pixel for bilinear - const float support1 = max(1.f / sf1, 1.f); - const float invscale1 = 1.0 / support1; - const float support0 = max(1.f / sf0, 1.f); - const float invscale0 
= 1.f / support0; + const float support1 = max(1.0f / sf1, 1.0f); + const float invscale1 = 1.0f / support1; + const float support0 = max(1.0f / sf0, 1.0f); + const float invscale0 = 1.0f / support0; // the range of source pixels that contribute const int64_t x_min = max(int64_t(0), int64_t(x - support0 + pixel_offset)); @@ -124,7 +124,7 @@ static __global__ void upscale_f32_bilinear_antialias(const float * src0, float float total_weight = 0.0f; auto triangle_filter = [](float x) -> float { - return max(1.0f - fabsf(x), 0.f); + return max(1.0f - fabsf(x), 0.0f); }; for (int64_t sy = y_min; sy < y_max; sy++) { From 0b14906ab7b39039dbbc61de39e58a5fa3cd21e2 Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Fri, 28 Nov 2025 17:03:12 +0100 Subject: [PATCH 10/13] Pad only for LFM2 --- tools/mtmd/clip.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 06e469a626e..490d3ab0ec7 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -4355,9 +4355,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str params.patch_size * params.n_merge, params.image_min_pixels, params.image_max_pixels); + const std::array pad_color = {122, 116, 104}; clip_image_u8 resized_img; - img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, false); + const bool pad = (ctx->proj_type() != PROJECTOR_TYPE_LFM2); + img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, pad, pad_color); clip_image_f32_ptr res(clip_image_f32_init()); normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std); res_imgs->entries.push_back(std::move(res)); From b81928f9795da0a7cb8af1592cb018a2e59f1ba6 Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Sun, 30 Nov 2025 11:16:22 +0100 Subject: [PATCH 11/13] Address PR feedback --- ggml/src/ggml-cpu/ops.cpp | 2 +- ggml/src/ggml-cuda/upscale.cu | 4 ++-- ggml/src/ggml.c | 2 ++ tools/mtmd/clip.cpp | 
1 + 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 32f937cad48..608e82af69f 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -7427,7 +7427,7 @@ static void ggml_compute_forward_upscale_f32( return std::max(1.0f - fabsf(x), 0.0f); }; - // support and invscale, maximum 1 pixel for bilinear + // support and invscale, minimum 1 pixel for bilinear const float support1 = std::max(1.0f, 1.0f / sf1); const float invscale1 = 1.0f / support1; const float support0 = std::max(1.0f, 1.0f / sf0); diff --git a/ggml/src/ggml-cuda/upscale.cu b/ggml/src/ggml-cuda/upscale.cu index 7d4a617e3c9..6bdf3cd996b 100644 --- a/ggml/src/ggml-cuda/upscale.cu +++ b/ggml/src/ggml-cuda/upscale.cu @@ -107,7 +107,7 @@ static __global__ void upscale_f32_bilinear_antialias(const float * src0, float const float y = ((float)i11_dst + pixel_offset) / sf1; const float x = ((float)i10_dst + pixel_offset) / sf0; - // support and invscale, maximum 1 pixel for bilinear + // support and invscale, minimum 1 pixel for bilinear const float support1 = max(1.0f / sf1, 1.0f); const float invscale1 = 1.0f / support1; const float support0 = max(1.0f / sf0, 1.0f); @@ -281,7 +281,7 @@ void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { if (mode == GGML_SCALE_MODE_NEAREST) { upscale_f32_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, stream); } else if (mode == GGML_SCALE_MODE_BILINEAR) { - bool antialias = (mode_flags & GGML_SCALE_FLAG_ANTIALIAS); + const bool antialias = (mode_flags & GGML_SCALE_FLAG_ANTIALIAS); upscale_f32_bilinear_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], src0->ne[0], src0->ne[1], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, pixel_offset, antialias, stream); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 
b99345a2e93..17cf4d84bb8 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -4891,6 +4891,8 @@ static struct ggml_tensor * ggml_interpolate_impl( int64_t ne3, uint32_t mode) { GGML_ASSERT((mode & 0xFF) < GGML_SCALE_MODE_COUNT); + // TODO: implement antialias for modes other than bilinear + GGML_ASSERT(!(mode & GGML_SCALE_FLAG_ANTIALIAS) || (mode & 0xFF) == GGML_SCALE_MODE_BILINEAR); struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3); diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 490d3ab0ec7..db477bbbe9b 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -2787,6 +2787,7 @@ struct clip_model_loader { { get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); // ref: https://huggingface.co/LiquidAI/LFM2-VL-3B/blob/main/preprocessor_config.json + // config above specifies number of tokens after downsampling, while here it is before, relax lower bound to 64 hparams.set_limit_image_tokens(64, 1024); } break; case PROJECTOR_TYPE_PIXTRAL: From 31be1a9fd11fadfa1d5df9bc0709d5f38ba40137 Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Sun, 30 Nov 2025 11:18:49 +0100 Subject: [PATCH 12/13] Roll back default marker placement changes --- tools/mtmd/mtmd-cli.cpp | 4 ++-- tools/mtmd/mtmd.cpp | 9 --------- tools/mtmd/mtmd.h | 3 --- 3 files changed, 2 insertions(+), 14 deletions(-) diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp index fff8714bfe3..6679de309b4 100644 --- a/tools/mtmd/mtmd-cli.cpp +++ b/tools/mtmd/mtmd-cli.cpp @@ -313,7 +313,7 @@ int main(int argc, char ** argv) { g_is_generating = true; if (params.prompt.find(mtmd_default_marker()) == std::string::npos) { for (size_t i = 0; i < params.image.size(); i++) { - params.prompt = mtmd::mtmd_add_default_marker(ctx.ctx_vision.get(), params.prompt); + params.prompt += mtmd_default_marker(); } } common_chat_msg msg; @@ -378,7 +378,7 @@ int main(int argc, char ** argv) { std::string media_path = line.substr(7); if (ctx.load_media(media_path)) { LOG("%s %s 
loaded\n", media_path.c_str(), is_image ? "image" : "audio"); - content = mtmd::mtmd_add_default_marker(ctx.ctx_vision.get(), content); + content += mtmd_default_marker(); } // else, error is already printed by libmtmd continue; diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index e616c464d05..6690bf30046 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -1103,12 +1103,3 @@ void mtmd_log_set(ggml_log_callback log_callback, void * user_data) { g_logger_state.log_callback = log_callback ? log_callback : clip_log_callback_default; g_logger_state.log_callback_user_data = user_data; } - -std::string mtmd::mtmd_add_default_marker(mtmd_context *ctx, const std::string &str) { - // for LFM2 image embeddings positioned before the text - if (ctx && ctx->ctx_v && clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_LFM2) { - return mtmd_default_marker() + str; - } - - return str + mtmd_default_marker(); -} diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index d204bd594db..015119be897 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -299,9 +299,6 @@ struct input_chunks { } }; -// insert mtmd_default_marker() into given string, position depends on the projector -std::string mtmd_add_default_marker(mtmd_context *ctx, const std::string &str); - } // namespace mtmd #endif From 2385ecf1b7b2952a81e335d2be80bba18d611c5d Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Sun, 30 Nov 2025 11:27:12 +0100 Subject: [PATCH 13/13] Fallback to CPU implementation for antialias implementation of upscale --- ggml/src/ggml-cann/ggml-cann.cpp | 3 +++ ggml/src/ggml-metal/ggml-metal-device.m | 2 +- ggml/src/ggml-opencl/ggml-opencl.cpp | 3 ++- ggml/src/ggml-sycl/ggml-sycl.cpp | 2 +- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 1 + 5 files changed, 8 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index df28d67fb0b..cd1b5e5b944 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp 
@@ -2500,6 +2500,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten if (op->op_params[0] != GGML_SCALE_MODE_NEAREST) { return false; } + if (op->op_params[0] & GGML_SCALE_FLAG_ANTIALIAS) { + return false; + } return true; } case GGML_OP_POOL_2D: diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index 09b1b503118..3aad16a3ff7 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -894,7 +894,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te case GGML_OP_POOL_1D: return false; case GGML_OP_UPSCALE: - return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST; + return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST && !(op->op_params[0] & GGML_SCALE_FLAG_ANTIALIAS); case GGML_OP_POOL_2D: return op->src[0]->type == GGML_TYPE_F32; case GGML_OP_PAD: diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index e5302f4550e..277a30d30ed 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -3086,8 +3086,9 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; case GGML_OP_UPSCALE: { ggml_scale_mode mode = (ggml_scale_mode)(ggml_get_op_params_i32(op, 0) & 0xFF); + const bool antialias = (ggml_scale_mode)(ggml_get_op_params_i32(op, 0) & GGML_SCALE_FLAG_ANTIALIAS); return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32 && - (mode == GGML_SCALE_MODE_NEAREST || mode == GGML_SCALE_MODE_BILINEAR); + (mode == GGML_SCALE_MODE_NEAREST || mode == GGML_SCALE_MODE_BILINEAR) && !antialias; } case GGML_OP_CONV_2D: return (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16) || diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp 
b/ggml/src/ggml-sycl/ggml-sycl.cpp index 3f1bdfb9f1b..e82b51206e2 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4597,7 +4597,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_IM2COL: return true; case GGML_OP_UPSCALE: - return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST; + return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST && !(op->op_params[0] & GGML_SCALE_FLAG_ANTIALIAS); case GGML_OP_SUM: case GGML_OP_SUM_ROWS: case GGML_OP_MEAN: diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 66dd0bfabd2..95966ce1d8e 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -14113,6 +14113,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm } return true; case GGML_OP_UPSCALE: + return op->src[0]->type == GGML_TYPE_F32 && !(op->op_params[0] & GGML_SCALE_FLAG_ANTIALIAS); case GGML_OP_ACC: return op->src[0]->type == GGML_TYPE_F32; case GGML_OP_CONCAT: