From 2386891b0671bc34aac27f3f8e214bb50644c017 Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Tue, 25 Nov 2025 18:55:14 +0100 Subject: [PATCH 01/13] Adjust to pytorch --- tools/mtmd/clip.cpp | 8 ++++---- tools/mtmd/mtmd.cpp | 4 ++++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 52ea542decc..18f62bf5868 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -3737,12 +3737,13 @@ struct img_tool { const int width = inp_size.width; const int height = inp_size.height; + auto round_by_factor = [f = align_size](float x) { return static_cast(std::round(x / static_cast(f))) * f; }; auto ceil_by_factor = [f = align_size](float x) { return static_cast(std::ceil(x / static_cast(f))) * f; }; auto floor_by_factor = [f = align_size](float x) { return static_cast(std::floor(x / static_cast(f))) * f; }; // always align up first - int h_bar = std::max(align_size, ceil_by_factor(height)); - int w_bar = std::max(align_size, ceil_by_factor(width)); + int h_bar = std::max(align_size, round_by_factor(height)); + int w_bar = std::max(align_size, round_by_factor(width)); if (h_bar * w_bar > max_pixels) { const auto beta = std::sqrt(static_cast(height * width) / max_pixels); @@ -4354,10 +4355,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str params.patch_size * params.n_merge, params.image_min_pixels, params.image_max_pixels); - const std::array pad_color = {122, 116, 104}; clip_image_u8 resized_img; - img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color); + img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, false); clip_image_f32_ptr res(clip_image_f32_init()); normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std); res_imgs->entries.push_back(std::move(res)); diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index dfad9cd7957..6690bf30046 100644 --- a/tools/mtmd/mtmd.cpp +++ 
b/tools/mtmd/mtmd.cpp @@ -304,6 +304,10 @@ struct mtmd_context { img_beg = "<|im_start|>"; img_end = "<|im_end|>"; + } else if (proj == PROJECTOR_TYPE_LFM2) { + img_beg = "<|image_start|>"; + img_end = "<|image_end|>"; + } } From c5090733d81cec544c867b6da91f253ce472b115 Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Thu, 27 Nov 2025 13:23:02 +0100 Subject: [PATCH 02/13] Add antialiasing upscale --- ggml/include/ggml.h | 7 ++-- ggml/src/ggml-cpu/ops.cpp | 87 +++++++++++++++++++++++++++++++++++++++ tools/mtmd/clip.cpp | 2 +- 3 files changed, 92 insertions(+), 4 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 4dbca868bc7..1bffb35ae9f 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2140,9 +2140,10 @@ extern "C" { float p1); enum ggml_scale_mode { - GGML_SCALE_MODE_NEAREST = 0, - GGML_SCALE_MODE_BILINEAR = 1, - GGML_SCALE_MODE_BICUBIC = 2, + GGML_SCALE_MODE_NEAREST = 0, + GGML_SCALE_MODE_BILINEAR = 1, + GGML_SCALE_MODE_BICUBIC = 2, + GGML_SCALE_MODE_BILINEAR_AA = 3, // bilinear with antialiasing (box filter for downsampling) GGML_SCALE_MODE_COUNT }; diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 2745fc54e15..cf2ccfd2fb9 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -7500,6 +7500,93 @@ static void ggml_compute_forward_upscale_f32( bicubic(p(-1, 1), p(0, 1), p(1, 1), p(2, 1), dx), bicubic(p(-1, 2), p(0, 2), p(1, 2), p(2, 2), dx), dy); + float * y_dst = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); + *y_dst = val; + } + } + } + } + } else if (mode == GGML_SCALE_MODE_BILINEAR_AA) { + // Bilinear with antialiasing - matches PyTorch's F.interpolate(..., mode='bilinear', antialias=True) + // This implementation follows PyTorch's approach: + // - scale = input_size / output_size (NOT output/input!) 
+ // - For downsampling (scale > 1): support = 1.0 * scale, invscale = 1.0 / scale + // - For upsampling (scale <= 1): support = 1.0, invscale = 1.0 + // See: https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/cpu/UpSampleKernel.cpp + + const int interp_size = 2; // bilinear + + // PyTorch's bilinear filter function: f(x) = max(0, 1 - |x|) + auto bilinear_filter = [](float x) -> float { + x = fabsf(x); + if (x < 1.0f) { + return 1.0f - x; + } + return 0.0f; + }; + + // Compute scales as input_size / output_size + const float scale0 = (float)ne00 / (float)ne0; + const float scale1 = (float)ne01 / (float)ne1; + + for (int64_t i3 = 0; i3 < ne3; i3++) { + const int64_t i03 = i3 / sf3; + for (int64_t i2 = ith; i2 < ne2; i2 += nth) { + const int64_t i02 = i2 / sf2; + for (int64_t i1 = 0; i1 < ne1; i1++) { + // Compute center position in source coordinates + // PyTorch formula: center = scale * (i + 0.5) + const float center_y = scale1 * ((float)i1 + 0.5f); + + // Compute support and invscale for y direction + // When downsampling (scale > 1), we need wider support for antialiasing + const float support_y = (scale1 > 1.0f) ? (interp_size * 0.5f) * scale1 : interp_size * 0.5f; + const float invscale_y = (scale1 > 1.0f) ? (1.0f / scale1) : 1.0f; + + for (int64_t i0 = 0; i0 < ne0; i0++) { + const float center_x = scale0 * ((float)i0 + 0.5f); + + // Compute support and invscale for x direction + const float support_x = (scale0 > 1.0f) ? (interp_size * 0.5f) * scale0 : interp_size * 0.5f; + const float invscale_x = (scale0 > 1.0f) ? 
(1.0f / scale0) : 1.0f; + + // Calculate the range of source pixels that contribute + const int64_t x_min = std::max(int64_t(0), (int64_t)(center_x - support_x + 0.5f)); + const int64_t x_max = std::min(ne00, (int64_t)(center_x + support_x + 0.5f)); + const int64_t y_min = std::max(int64_t(0), (int64_t)(center_y - support_y + 0.5f)); + const int64_t y_max = std::min(ne01, (int64_t)(center_y + support_y + 0.5f)); + + float val = 0.0f; + float total_weight = 0.0f; + + // Apply bilinear filter with antialiasing + for (int64_t sy = y_min; sy < y_max; sy++) { + // Compute bilinear weight for y direction + const float weight_y = bilinear_filter((sy - center_y + 0.5f) * invscale_y); + + for (int64_t sx = x_min; sx < x_max; sx++) { + // Compute bilinear weight for x direction + const float weight_x = bilinear_filter((sx - center_x + 0.5f) * invscale_x); + + const float weight = weight_x * weight_y; + + if (weight > 0.0f) { + const float pixel = *(const float *)((const char *)src0->data + + sx*nb00 + + sy*nb01 + + i02*nb02 + + i03*nb03); + val += pixel * weight; + total_weight += weight; + } + } + } + + // Normalize by total weight + if (total_weight > 0.0f) { + val /= total_weight; + } + float * y_dst = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); *y_dst = val; } diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 18f62bf5868..f571a6102c8 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -2012,7 +2012,7 @@ struct clip_graph { ggml_tensor * pos_embd = model.position_embeddings; const int height = img.ny / patch_size; const int width = img.nx / patch_size; - const uint32_t mode = GGML_SCALE_MODE_BILINEAR; + const uint32_t mode = GGML_SCALE_MODE_BILINEAR_AA; const int n_per_side = (int)std::sqrt(pos_embd->ne[1]); GGML_ASSERT(pos_embd); From 80b4e97e3aac43d1dc27fab79271c43fa57f643e Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Thu, 27 Nov 2025 13:23:33 +0100 Subject: [PATCH 03/13] Increase number of patches to 1024 --- 
tools/mtmd/clip.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index f571a6102c8..9cd37835451 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -2787,7 +2787,7 @@ struct clip_model_loader { { get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); // ref: https://huggingface.co/LiquidAI/LFM2-VL-3B/blob/main/preprocessor_config.json - hparams.set_limit_image_tokens(64, 256); + hparams.set_limit_image_tokens(64, 1024); } break; case PROJECTOR_TYPE_PIXTRAL: case PROJECTOR_TYPE_LIGHTONOCR: From 1cd4e2fd59185c76ac6e45b65b93288eae3f56dc Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Thu, 27 Nov 2025 13:54:04 +0100 Subject: [PATCH 04/13] Handle default marker insertion for LFM2 --- tools/mtmd/mtmd-cli.cpp | 4 ++-- tools/mtmd/mtmd.cpp | 9 +++++++++ tools/mtmd/mtmd.h | 3 +++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp index 6679de309b4..fff8714bfe3 100644 --- a/tools/mtmd/mtmd-cli.cpp +++ b/tools/mtmd/mtmd-cli.cpp @@ -313,7 +313,7 @@ int main(int argc, char ** argv) { g_is_generating = true; if (params.prompt.find(mtmd_default_marker()) == std::string::npos) { for (size_t i = 0; i < params.image.size(); i++) { - params.prompt += mtmd_default_marker(); + params.prompt = mtmd::mtmd_add_default_marker(ctx.ctx_vision.get(), params.prompt); } } common_chat_msg msg; @@ -378,7 +378,7 @@ int main(int argc, char ** argv) { std::string media_path = line.substr(7); if (ctx.load_media(media_path)) { LOG("%s %s loaded\n", media_path.c_str(), is_image ? 
"image" : "audio"); - content += mtmd_default_marker(); + content = mtmd::mtmd_add_default_marker(ctx.ctx_vision.get(), content); } // else, error is already printed by libmtmd continue; diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 6690bf30046..e616c464d05 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -1103,3 +1103,12 @@ void mtmd_log_set(ggml_log_callback log_callback, void * user_data) { g_logger_state.log_callback = log_callback ? log_callback : clip_log_callback_default; g_logger_state.log_callback_user_data = user_data; } + +std::string mtmd::mtmd_add_default_marker(mtmd_context *ctx, const std::string &str) { + // for LFM2 image embeddings positioned before the text + if (ctx && ctx->ctx_v && clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_LFM2) { + return mtmd_default_marker() + str; + } + + return str + mtmd_default_marker(); +} diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index 015119be897..d204bd594db 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -299,6 +299,9 @@ struct input_chunks { } }; +// insert mtmd_default_marker() into given string, position depends on the projector +std::string mtmd_add_default_marker(mtmd_context *ctx, const std::string &str); + } // namespace mtmd #endif From 40e08b83a8f67f2ac5b135612afabc7f215ca0e4 Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Thu, 27 Nov 2025 15:06:12 +0100 Subject: [PATCH 05/13] Switch to flag --- ggml/include/ggml.h | 10 +-- ggml/src/ggml-cpu/ops.cpp | 180 ++++++++++++++++++++------------------ tools/mtmd/clip.cpp | 2 +- 3 files changed, 99 insertions(+), 93 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 1bffb35ae9f..48da68fe7e3 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2140,16 +2140,16 @@ extern "C" { float p1); enum ggml_scale_mode { - GGML_SCALE_MODE_NEAREST = 0, - GGML_SCALE_MODE_BILINEAR = 1, - GGML_SCALE_MODE_BICUBIC = 2, - GGML_SCALE_MODE_BILINEAR_AA = 3, // bilinear with antialiasing (box 
filter for downsampling) + GGML_SCALE_MODE_NEAREST = 0, + GGML_SCALE_MODE_BILINEAR = 1, + GGML_SCALE_MODE_BICUBIC = 2, GGML_SCALE_MODE_COUNT }; enum ggml_scale_flag { - GGML_SCALE_FLAG_ALIGN_CORNERS = (1 << 8) + GGML_SCALE_FLAG_ALIGN_CORNERS = (1 << 8), + GGML_SCALE_FLAG_ANTIALIAS = (1 << 9), }; // interpolate diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index cf2ccfd2fb9..f6f2805ef23 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -7402,6 +7402,99 @@ static void ggml_compute_forward_upscale_f32( sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1; } + // Antialiasing preprocessing step + // Apply antialiasing filter if flag is set and write directly to dst + bool antialiasing_applied = false; + + if (mode_flags & GGML_SCALE_FLAG_ANTIALIAS) { + // Only apply antialiasing when downsampling (scale < 1.0) + const float scale0 = (float)ne00 / (float)ne0; + const float scale1 = (float)ne01 / (float)ne1; + + if (scale0 > 1.0f || scale1 > 1.0f) { + // Apply antialiasing filter to src0 and write directly to dst + // PyTorch's bilinear filter function: f(x) = max(0, 1 - |x|) + auto bilinear_filter = [](float x) -> float { + x = fabsf(x); + if (x < 1.0f) { + return 1.0f - x; + } + return 0.0f; + }; + + const int interp_size = 2; // bilinear + + for (int64_t i3 = 0; i3 < ne3; i3++) { + const int64_t i03 = i3 / sf3; + for (int64_t i2 = ith; i2 < ne2; i2 += nth) { + const int64_t i02 = i2 / sf2; + for (int64_t i1 = 0; i1 < ne1; i1++) { + // Compute center position in source coordinates + const float center_y = scale1 * ((float)i1 + 0.5f); + + // Compute support and invscale for y direction + const float support_y = (scale1 > 1.0f) ? (interp_size * 0.5f) * scale1 : interp_size * 0.5f; + const float invscale_y = (scale1 > 1.0f) ? 
(1.0f / scale1) : 1.0f; + + for (int64_t i0 = 0; i0 < ne0; i0++) { + const float center_x = scale0 * ((float)i0 + 0.5f); + + // Compute support and invscale for x direction + const float support_x = (scale0 > 1.0f) ? (interp_size * 0.5f) * scale0 : interp_size * 0.5f; + const float invscale_x = (scale0 > 1.0f) ? (1.0f / scale0) : 1.0f; + + // Calculate the range of source pixels that contribute + const int64_t x_min = std::max(int64_t(0), (int64_t)(center_x - support_x + 0.5f)); + const int64_t x_max = std::min(ne00, (int64_t)(center_x + support_x + 0.5f)); + const int64_t y_min = std::max(int64_t(0), (int64_t)(center_y - support_y + 0.5f)); + const int64_t y_max = std::min(ne01, (int64_t)(center_y + support_y + 0.5f)); + + float val = 0.0f; + float total_weight = 0.0f; + + // Apply bilinear filter with antialiasing + for (int64_t sy = y_min; sy < y_max; sy++) { + const float weight_y = bilinear_filter((sy - center_y + 0.5f) * invscale_y); + + for (int64_t sx = x_min; sx < x_max; sx++) { + const float weight_x = bilinear_filter((sx - center_x + 0.5f) * invscale_x); + const float weight = weight_x * weight_y; + + if (weight > 0.0f) { + const float pixel = *(const float *)((const char *)src0->data + + sx*nb00 + + sy*nb01 + + i02*nb02 + + i03*nb03); + val += pixel * weight; + total_weight += weight; + } + } + } + + // Normalize by total weight + if (total_weight > 0.0f) { + val /= total_weight; + } + + // Write directly to dst + float * dst_ptr = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); + *dst_ptr = val; + } + } + } + } + + antialiasing_applied = true; + } + } + + // If antialiasing was not applied, proceed with regular interpolation + if (antialiasing_applied) { + // Antialiasing result is already in dst, we're done + return; + } + if (mode == GGML_SCALE_MODE_NEAREST) { for (int64_t i3 = 0; i3 < ne3; i3++) { const int64_t i03 = i3 / sf3; @@ -7500,93 +7593,6 @@ static void ggml_compute_forward_upscale_f32( bicubic(p(-1, 1), p(0, 1), p(1, 1), 
p(2, 1), dx), bicubic(p(-1, 2), p(0, 2), p(1, 2), p(2, 2), dx), dy); - float * y_dst = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); - *y_dst = val; - } - } - } - } - } else if (mode == GGML_SCALE_MODE_BILINEAR_AA) { - // Bilinear with antialiasing - matches PyTorch's F.interpolate(..., mode='bilinear', antialias=True) - // This implementation follows PyTorch's approach: - // - scale = input_size / output_size (NOT output/input!) - // - For downsampling (scale > 1): support = 1.0 * scale, invscale = 1.0 / scale - // - For upsampling (scale <= 1): support = 1.0, invscale = 1.0 - // See: https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/cpu/UpSampleKernel.cpp - - const int interp_size = 2; // bilinear - - // PyTorch's bilinear filter function: f(x) = max(0, 1 - |x|) - auto bilinear_filter = [](float x) -> float { - x = fabsf(x); - if (x < 1.0f) { - return 1.0f - x; - } - return 0.0f; - }; - - // Compute scales as input_size / output_size - const float scale0 = (float)ne00 / (float)ne0; - const float scale1 = (float)ne01 / (float)ne1; - - for (int64_t i3 = 0; i3 < ne3; i3++) { - const int64_t i03 = i3 / sf3; - for (int64_t i2 = ith; i2 < ne2; i2 += nth) { - const int64_t i02 = i2 / sf2; - for (int64_t i1 = 0; i1 < ne1; i1++) { - // Compute center position in source coordinates - // PyTorch formula: center = scale * (i + 0.5) - const float center_y = scale1 * ((float)i1 + 0.5f); - - // Compute support and invscale for y direction - // When downsampling (scale > 1), we need wider support for antialiasing - const float support_y = (scale1 > 1.0f) ? (interp_size * 0.5f) * scale1 : interp_size * 0.5f; - const float invscale_y = (scale1 > 1.0f) ? (1.0f / scale1) : 1.0f; - - for (int64_t i0 = 0; i0 < ne0; i0++) { - const float center_x = scale0 * ((float)i0 + 0.5f); - - // Compute support and invscale for x direction - const float support_x = (scale0 > 1.0f) ? 
(interp_size * 0.5f) * scale0 : interp_size * 0.5f; - const float invscale_x = (scale0 > 1.0f) ? (1.0f / scale0) : 1.0f; - - // Calculate the range of source pixels that contribute - const int64_t x_min = std::max(int64_t(0), (int64_t)(center_x - support_x + 0.5f)); - const int64_t x_max = std::min(ne00, (int64_t)(center_x + support_x + 0.5f)); - const int64_t y_min = std::max(int64_t(0), (int64_t)(center_y - support_y + 0.5f)); - const int64_t y_max = std::min(ne01, (int64_t)(center_y + support_y + 0.5f)); - - float val = 0.0f; - float total_weight = 0.0f; - - // Apply bilinear filter with antialiasing - for (int64_t sy = y_min; sy < y_max; sy++) { - // Compute bilinear weight for y direction - const float weight_y = bilinear_filter((sy - center_y + 0.5f) * invscale_y); - - for (int64_t sx = x_min; sx < x_max; sx++) { - // Compute bilinear weight for x direction - const float weight_x = bilinear_filter((sx - center_x + 0.5f) * invscale_x); - - const float weight = weight_x * weight_y; - - if (weight > 0.0f) { - const float pixel = *(const float *)((const char *)src0->data + - sx*nb00 + - sy*nb01 + - i02*nb02 + - i03*nb03); - val += pixel * weight; - total_weight += weight; - } - } - } - - // Normalize by total weight - if (total_weight > 0.0f) { - val /= total_weight; - } - float * y_dst = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); *y_dst = val; } diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 9cd37835451..06e469a626e 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -2012,7 +2012,7 @@ struct clip_graph { ggml_tensor * pos_embd = model.position_embeddings; const int height = img.ny / patch_size; const int width = img.nx / patch_size; - const uint32_t mode = GGML_SCALE_MODE_BILINEAR_AA; + const uint32_t mode = GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS; const int n_per_side = (int)std::sqrt(pos_embd->ne[1]); GGML_ASSERT(pos_embd); From 65789e5b14450ea7b345222fa2df92164b541473 Mon Sep 17 00:00:00 2001 From: 
Tarek Dakhran Date: Fri, 28 Nov 2025 15:56:18 +0100 Subject: [PATCH 06/13] Reformat --- ggml/src/ggml-cpu/ops.cpp | 136 ++++++++++++++------------------------ 1 file changed, 51 insertions(+), 85 deletions(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index f6f2805ef23..0210956372d 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -7402,100 +7402,66 @@ static void ggml_compute_forward_upscale_f32( sf1 = ne1 > 1 && ne01 > 1 ? (float)(ne1 - 1) / (ne01 - 1) : sf1; } - // Antialiasing preprocessing step - // Apply antialiasing filter if flag is set and write directly to dst - bool antialiasing_applied = false; - - if (mode_flags & GGML_SCALE_FLAG_ANTIALIAS) { - // Only apply antialiasing when downsampling (scale < 1.0) - const float scale0 = (float)ne00 / (float)ne0; - const float scale1 = (float)ne01 / (float)ne1; - - if (scale0 > 1.0f || scale1 > 1.0f) { - // Apply antialiasing filter to src0 and write directly to dst - // PyTorch's bilinear filter function: f(x) = max(0, 1 - |x|) - auto bilinear_filter = [](float x) -> float { - x = fabsf(x); - if (x < 1.0f) { - return 1.0f - x; - } - return 0.0f; - }; - - const int interp_size = 2; // bilinear - - for (int64_t i3 = 0; i3 < ne3; i3++) { - const int64_t i03 = i3 / sf3; - for (int64_t i2 = ith; i2 < ne2; i2 += nth) { - const int64_t i02 = i2 / sf2; - for (int64_t i1 = 0; i1 < ne1; i1++) { - // Compute center position in source coordinates - const float center_y = scale1 * ((float)i1 + 0.5f); - - // Compute support and invscale for y direction - const float support_y = (scale1 > 1.0f) ? (interp_size * 0.5f) * scale1 : interp_size * 0.5f; - const float invscale_y = (scale1 > 1.0f) ? (1.0f / scale1) : 1.0f; - - for (int64_t i0 = 0; i0 < ne0; i0++) { - const float center_x = scale0 * ((float)i0 + 0.5f); - - // Compute support and invscale for x direction - const float support_x = (scale0 > 1.0f) ? 
(interp_size * 0.5f) * scale0 : interp_size * 0.5f; - const float invscale_x = (scale0 > 1.0f) ? (1.0f / scale0) : 1.0f; - - // Calculate the range of source pixels that contribute - const int64_t x_min = std::max(int64_t(0), (int64_t)(center_x - support_x + 0.5f)); - const int64_t x_max = std::min(ne00, (int64_t)(center_x + support_x + 0.5f)); - const int64_t y_min = std::max(int64_t(0), (int64_t)(center_y - support_y + 0.5f)); - const int64_t y_max = std::min(ne01, (int64_t)(center_y + support_y + 0.5f)); - - float val = 0.0f; - float total_weight = 0.0f; - - // Apply bilinear filter with antialiasing - for (int64_t sy = y_min; sy < y_max; sy++) { - const float weight_y = bilinear_filter((sy - center_y + 0.5f) * invscale_y); - - for (int64_t sx = x_min; sx < x_max; sx++) { - const float weight_x = bilinear_filter((sx - center_x + 0.5f) * invscale_x); - const float weight = weight_x * weight_y; - - if (weight > 0.0f) { - const float pixel = *(const float *)((const char *)src0->data + - sx*nb00 + - sy*nb01 + - i02*nb02 + - i03*nb03); - val += pixel * weight; - total_weight += weight; - } + // Similar to F.interpolate(..., mode="bilinear", align_corners=False, antialias=True) + // https://github.com/pytorch/pytorch/blob/8871ff29b743948d1225389d5b7068f37b22750b/aten/src/ATen/native/cpu/UpSampleKernel.cpp + if (mode == GGML_SCALE_MODE_BILINEAR && (mode_flags & GGML_SCALE_FLAG_ANTIALIAS)) { + auto triangle_filter = [](float x) -> float { + return std::max(1.0f - fabsf(x), 0.f); + }; + + // support and invscale, maximum 1 pixel for bilinear + const float support1 = std::max(1.f, 1.f / sf1); + const float invscale1 = 1.0 / support1; + const float support0 = std::max(1.f, 1.f / sf0); + const float invscale0 = 1.f / support0; + + for (int64_t i3 = 0; i3 < ne3; i3++) { + const int64_t i03 = i3 / sf3; + for (int64_t i2 = ith; i2 < ne2; i2 += nth) { + const int64_t i02 = i2 / sf2; + for (int64_t i1 = 0; i1 < ne1; i1++) { + const float y = ((float) i1 + pixel_offset) / sf1; + 
for (int64_t i0 = 0; i0 < ne0; i0++) { + const float x = ((float) i0 + pixel_offset) / sf0; + + // the range of source pixels that contribute + const int64_t x_min = std::max(int64_t(0), (int64_t) (x - support0 + pixel_offset)); + const int64_t x_max = std::min(ne00, (int64_t) (x + support0 + pixel_offset)); + const int64_t y_min = std::max(int64_t(0), (int64_t) (y - support1 + pixel_offset)); + const int64_t y_max = std::min(ne01, (int64_t) (y + support1 + pixel_offset)); + + // bilinear filter with antialiasing + float val = 0.0f; + float total_weight = 0.0f; + + for (int64_t sy = y_min; sy < y_max; sy++) { + const float weight_y = triangle_filter((sy - y + pixel_offset) * invscale1); + + for (int64_t sx = x_min; sx < x_max; sx++) { + const float weight_x = triangle_filter((sx - x + pixel_offset) * invscale0); + const float weight = weight_x * weight_y; + + if (weight <= 0.0f) { + continue; } - } - // Normalize by total weight - if (total_weight > 0.0f) { - val /= total_weight; + const float pixel = *(const float *)((const char *)src0->data + sx*nb00 + sy*nb01 + i02*nb02 + i03*nb03); + val += pixel * weight; + total_weight += weight; } + } - // Write directly to dst - float * dst_ptr = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); - *dst_ptr = val; + if (total_weight > 0.0f) { + val /= total_weight; } + + float * dst_ptr = (float *)((char *)dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); + *dst_ptr = val; } } } - - antialiasing_applied = true; } - } - - // If antialiasing was not applied, proceed with regular interpolation - if (antialiasing_applied) { - // Antialiasing result is already in dst, we're done - return; - } - - if (mode == GGML_SCALE_MODE_NEAREST) { + } else if (mode == GGML_SCALE_MODE_NEAREST) { for (int64_t i3 = 0; i3 < ne3; i3++) { const int64_t i03 = i3 / sf3; for (int64_t i2 = ith; i2 < ne2; i2 += nth) { From 7cf67d6cef1972e216491dd4b2f1995dc673fe55 Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Fri, 28 Nov 2025 16:30:57 
+0100 Subject: [PATCH 07/13] Cuda implementation of antialias kernel --- ggml/src/ggml-cpu/ops.cpp | 8 ++-- ggml/src/ggml-cuda/upscale.cu | 81 +++++++++++++++++++++++++++++++++-- tests/test-backend-ops.cpp | 2 +- 3 files changed, 83 insertions(+), 8 deletions(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 0210956372d..531292d3d5d 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -7425,10 +7425,10 @@ static void ggml_compute_forward_upscale_f32( const float x = ((float) i0 + pixel_offset) / sf0; // the range of source pixels that contribute - const int64_t x_min = std::max(int64_t(0), (int64_t) (x - support0 + pixel_offset)); - const int64_t x_max = std::min(ne00, (int64_t) (x + support0 + pixel_offset)); - const int64_t y_min = std::max(int64_t(0), (int64_t) (y - support1 + pixel_offset)); - const int64_t y_max = std::min(ne01, (int64_t) (y + support1 + pixel_offset)); + const int64_t x_min = std::max(x - support0 + pixel_offset, 0); + const int64_t x_max = std::min(x + support0 + pixel_offset, ne00); + const int64_t y_min = std::max(y - support1 + pixel_offset, 0); + const int64_t y_max = std::min(y + support1 + pixel_offset, ne01); // bilinear filter with antialiasing float val = 0.0f; diff --git a/ggml/src/ggml-cuda/upscale.cu b/ggml/src/ggml-cuda/upscale.cu index 687c669304d..9b9c6a2579a 100644 --- a/ggml/src/ggml-cuda/upscale.cu +++ b/ggml/src/ggml-cuda/upscale.cu @@ -81,6 +81,76 @@ static __global__ void upscale_f32_bilinear(const float * x, float * dst, dst[index] = result; } +// Similar to F.interpolate(..., mode="bilinear", align_corners=False, antialias=True) +// https://github.com/pytorch/pytorch/blob/8871ff29b743948d1225389d5b7068f37b22750b/aten/src/ATen/native/cpu/UpSampleKernel.cpp +static __global__ void upscale_f32_bilinear_antialias(const float * src0, float * dst, + const int nb00, const int nb01, const int nb02, const int nb03, + const int ne00_src, const int ne01_src, + const int ne10_dst, 
const int ne11_dst, const int ne12_dst, const int ne13_dst, + const float sf0, const float sf1, const float sf2, const float sf3, + const float pixel_offset) { + const int64_t index = threadIdx.x + blockIdx.x * blockDim.x; + const int64_t dst_total_elements = ne10_dst * ne11_dst * ne12_dst * ne13_dst; + + if (index >= dst_total_elements) { + return; + } + + const int i10_dst = index % ne10_dst; + const int i11_dst = (index / ne10_dst) % ne11_dst; + const int i12_dst = (index / (ne10_dst * ne11_dst)) % ne12_dst; + const int i13_dst = index / (ne10_dst * ne11_dst * ne12_dst); + + const int i02_src = (int)(i12_dst / sf2); + const int i03_src = (int)(i13_dst / sf3); + + const float y = ((float)i11_dst + pixel_offset) / sf1; + const float x = ((float)i10_dst + pixel_offset) / sf0; + + // support and invscale, maximum 1 pixel for bilinear + const float support1 = max(1.f / sf1, 1.f); + const float invscale1 = 1.0 / support1; + const float support0 = max(1.f / sf0, 1.f); + const float invscale0 = 1.f / support0; + + // the range of source pixels that contribute + const int64_t x_min = max(int64_t(0), int64_t(x - support0 + pixel_offset)); + const int64_t x_max = min(int64_t(ne00_src), int64_t(x + support0 + pixel_offset)); + const int64_t y_min = max(int64_t(0), int64_t(y - support1 + pixel_offset)); + const int64_t y_max = min(int64_t(ne01_src), int64_t(y + support1 + pixel_offset)); + + // bilinear filter with antialiasing + float val = 0.0f; + float total_weight = 0.0f; + + auto triangle_filter = [](float x) -> float { + return max(1.0f - fabsf(x), 0.f); + }; + + for (int64_t sy = y_min; sy < y_max; sy++) { + const float weight_y = triangle_filter((sy - y + pixel_offset) * invscale1); + + for (int64_t sx = x_min; sx < x_max; sx++) { + const float weight_x = triangle_filter((sx - x + pixel_offset) * invscale0); + const float weight = weight_x * weight_y; + + if (weight <= 0.0f) { + continue; + } + + const float pixel = *(const float *)((const char *)src0 + sx*nb00 + 
sy*nb01 + i02_src*nb02 + i03_src*nb03); + val += pixel * weight; + total_weight += weight; + } + } + + if (total_weight > 0.0f) { + val /= total_weight; + } + + dst[index] = val; +} + namespace bicubic_interpolation { // https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm __device__ const float a = -0.75f; // use alpha = -0.75 (same as PyTorch) @@ -161,11 +231,15 @@ static void upscale_f32_bilinear_cuda(const float * x, float * dst, const int ne00_src, const int ne01_src, const int ne10_dst, const int ne11_dst, const int ne12_dst, const int ne13_dst, const float sf0, const float sf1, const float sf2, const float sf3, - const float pixel_offset, cudaStream_t stream) { + const float pixel_offset, bool antialias, cudaStream_t stream) { const int64_t dst_size = ne10_dst * ne11_dst * ne12_dst * ne13_dst; const int64_t num_blocks = (dst_size + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE; - upscale_f32_bilinear<<>>(x, dst, nb00, nb01, nb02, nb03, ne00_src, ne01_src, ne10_dst, ne11_dst, ne12_dst, ne13_dst, sf0, sf1, sf2, sf3, pixel_offset); + if (antialias) { + upscale_f32_bilinear_antialias<<>>(x, dst, nb00, nb01, nb02, nb03, ne00_src, ne01_src, ne10_dst, ne11_dst, ne12_dst, ne13_dst, sf0, sf1, sf2, sf3, pixel_offset); + } else { + upscale_f32_bilinear<<>>(x, dst, nb00, nb01, nb02, nb03, ne00_src, ne01_src, ne10_dst, ne11_dst, ne12_dst, ne13_dst, sf0, sf1, sf2, sf3, pixel_offset); + } } static void upscale_f32_bicubic_cuda(const float * x, float * dst, @@ -207,9 +281,10 @@ void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { if (mode == GGML_SCALE_MODE_NEAREST) { upscale_f32_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, stream); } else if (mode == GGML_SCALE_MODE_BILINEAR) { + bool antialias = (mode_flags & GGML_SCALE_FLAG_ANTIALIAS); upscale_f32_bilinear_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], 
src0->nb[3], src0->ne[0], src0->ne[1], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], - sf0, sf1, sf2, sf3, pixel_offset, stream); + sf0, sf1, sf2, sf3, pixel_offset, antialias, stream); } else if (mode == GGML_SCALE_MODE_BICUBIC) { upscale_f32_bicubic_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], src0->ne[0], src0->ne[1], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 87a61aa1224..9645d0b3909 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -7660,7 +7660,7 @@ static std::vector> make_test_cases_eval() { // test_cases.emplace_back(new test_top_k(GGML_TYPE_F32, {i, 2, 1, 3}, rand() % i + 1)); //} - for (ggml_scale_mode mode : {GGML_SCALE_MODE_NEAREST, GGML_SCALE_MODE_BILINEAR, GGML_SCALE_MODE_BICUBIC}) { + for (ggml_scale_mode mode : {GGML_SCALE_MODE_NEAREST, GGML_SCALE_MODE_BILINEAR, GGML_SCALE_MODE_BICUBIC, ggml_scale_mode(GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ANTIALIAS)}) { test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, {512, 512, 3, 2}, 2, mode)); test_cases.emplace_back(new test_upscale(GGML_TYPE_F32, {512, 512, 3, 2}, 2, mode, true)); test_cases.emplace_back(new test_interpolate(GGML_TYPE_F32, {2, 5, 7, 11}, {5, 7, 11, 13}, mode)); From 7c8b09868c6fce98fa3fa94f19ee0aca52080179 Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Fri, 28 Nov 2025 16:32:25 +0100 Subject: [PATCH 08/13] Change placement in ops.cpp --- ggml/src/ggml-cpu/ops.cpp | 42 +++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 531292d3d5d..07f651f1a00 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -7402,9 +7402,27 @@ static void ggml_compute_forward_upscale_f32( sf1 = ne1 > 1 && ne01 > 1 ? 
(float)(ne1 - 1) / (ne01 - 1) : sf1; } - // Similar to F.interpolate(..., mode="bilinear", align_corners=False, antialias=True) - // https://github.com/pytorch/pytorch/blob/8871ff29b743948d1225389d5b7068f37b22750b/aten/src/ATen/native/cpu/UpSampleKernel.cpp - if (mode == GGML_SCALE_MODE_BILINEAR && (mode_flags & GGML_SCALE_FLAG_ANTIALIAS)) { + if (mode == GGML_SCALE_MODE_NEAREST) { + for (int64_t i3 = 0; i3 < ne3; i3++) { + const int64_t i03 = i3 / sf3; + for (int64_t i2 = ith; i2 < ne2; i2 += nth) { + const int64_t i02 = i2 / sf2; + for (int64_t i1 = 0; i1 < ne1; i1++) { + const int64_t i01 = i1 / sf1; + for (int64_t i0 = 0; i0 < ne0; i0++) { + const int64_t i00 = i0 / sf0; + + const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); + + *y = *x; + } + } + } + } + } else if (mode == GGML_SCALE_MODE_BILINEAR && (mode_flags & GGML_SCALE_FLAG_ANTIALIAS)) { + // Similar to F.interpolate(..., mode="bilinear", align_corners=False, antialias=True) + // https://github.com/pytorch/pytorch/blob/8871ff29b743948d1225389d5b7068f37b22750b/aten/src/ATen/native/cpu/UpSampleKernel.cpp auto triangle_filter = [](float x) -> float { return std::max(1.0f - fabsf(x), 0.f); }; @@ -7461,24 +7479,6 @@ static void ggml_compute_forward_upscale_f32( } } } - } else if (mode == GGML_SCALE_MODE_NEAREST) { - for (int64_t i3 = 0; i3 < ne3; i3++) { - const int64_t i03 = i3 / sf3; - for (int64_t i2 = ith; i2 < ne2; i2 += nth) { - const int64_t i02 = i2 / sf2; - for (int64_t i1 = 0; i1 < ne1; i1++) { - const int64_t i01 = i1 / sf1; - for (int64_t i0 = 0; i0 < ne0; i0++) { - const int64_t i00 = i0 / sf0; - - const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); - float * y = (float *)((char *) dst->data + i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3); - - *y = *x; - } - } - } - } } else if (mode == GGML_SCALE_MODE_BILINEAR) { for (int64_t i3 = 0; i3 < ne3; 
i3++) { const int64_t i03 = i3 / sf3; From 3ea706e94386b960237f3461ce9658c36e5530f4 Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Fri, 28 Nov 2025 16:45:58 +0100 Subject: [PATCH 09/13] consistent float literals --- ggml/src/ggml-cpu/ops.cpp | 10 +++++----- ggml/src/ggml-cuda/upscale.cu | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 07f651f1a00..32f937cad48 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -7424,14 +7424,14 @@ static void ggml_compute_forward_upscale_f32( // Similar to F.interpolate(..., mode="bilinear", align_corners=False, antialias=True) // https://github.com/pytorch/pytorch/blob/8871ff29b743948d1225389d5b7068f37b22750b/aten/src/ATen/native/cpu/UpSampleKernel.cpp auto triangle_filter = [](float x) -> float { - return std::max(1.0f - fabsf(x), 0.f); + return std::max(1.0f - fabsf(x), 0.0f); }; // support and invscale, maximum 1 pixel for bilinear - const float support1 = std::max(1.f, 1.f / sf1); - const float invscale1 = 1.0 / support1; - const float support0 = std::max(1.f, 1.f / sf0); - const float invscale0 = 1.f / support0; + const float support1 = std::max(1.0f, 1.0f / sf1); + const float invscale1 = 1.0f / support1; + const float support0 = std::max(1.0f, 1.0f / sf0); + const float invscale0 = 1.0f / support0; for (int64_t i3 = 0; i3 < ne3; i3++) { const int64_t i03 = i3 / sf3; diff --git a/ggml/src/ggml-cuda/upscale.cu b/ggml/src/ggml-cuda/upscale.cu index 9b9c6a2579a..7d4a617e3c9 100644 --- a/ggml/src/ggml-cuda/upscale.cu +++ b/ggml/src/ggml-cuda/upscale.cu @@ -108,10 +108,10 @@ static __global__ void upscale_f32_bilinear_antialias(const float * src0, float const float x = ((float)i10_dst + pixel_offset) / sf0; // support and invscale, maximum 1 pixel for bilinear - const float support1 = max(1.f / sf1, 1.f); - const float invscale1 = 1.0 / support1; - const float support0 = max(1.f / sf0, 1.f); - const float invscale0 
= 1.f / support0; + const float support1 = max(1.0f / sf1, 1.0f); + const float invscale1 = 1.0f / support1; + const float support0 = max(1.0f / sf0, 1.0f); + const float invscale0 = 1.0f / support0; // the range of source pixels that contribute const int64_t x_min = max(int64_t(0), int64_t(x - support0 + pixel_offset)); @@ -124,7 +124,7 @@ static __global__ void upscale_f32_bilinear_antialias(const float * src0, float float total_weight = 0.0f; auto triangle_filter = [](float x) -> float { - return max(1.0f - fabsf(x), 0.f); + return max(1.0f - fabsf(x), 0.0f); }; for (int64_t sy = y_min; sy < y_max; sy++) { From 0b14906ab7b39039dbbc61de39e58a5fa3cd21e2 Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Fri, 28 Nov 2025 17:03:12 +0100 Subject: [PATCH 10/13] Pad only for LFM2 --- tools/mtmd/clip.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 06e469a626e..490d3ab0ec7 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -4355,9 +4355,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str params.patch_size * params.n_merge, params.image_min_pixels, params.image_max_pixels); + const std::array pad_color = {122, 116, 104}; clip_image_u8 resized_img; - img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, false); + const bool pad = (ctx->proj_type() != PROJECTOR_TYPE_LFM2); + img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, pad, pad_color); clip_image_f32_ptr res(clip_image_f32_init()); normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std); res_imgs->entries.push_back(std::move(res)); From b81928f9795da0a7cb8af1592cb018a2e59f1ba6 Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Sun, 30 Nov 2025 11:16:22 +0100 Subject: [PATCH 11/13] Address PR feedback --- ggml/src/ggml-cpu/ops.cpp | 2 +- ggml/src/ggml-cuda/upscale.cu | 4 ++-- ggml/src/ggml.c | 2 ++ tools/mtmd/clip.cpp | 
1 + 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 32f937cad48..608e82af69f 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -7427,7 +7427,7 @@ static void ggml_compute_forward_upscale_f32( return std::max(1.0f - fabsf(x), 0.0f); }; - // support and invscale, maximum 1 pixel for bilinear + // support and invscale, minimum 1 pixel for bilinear const float support1 = std::max(1.0f, 1.0f / sf1); const float invscale1 = 1.0f / support1; const float support0 = std::max(1.0f, 1.0f / sf0); diff --git a/ggml/src/ggml-cuda/upscale.cu b/ggml/src/ggml-cuda/upscale.cu index 7d4a617e3c9..6bdf3cd996b 100644 --- a/ggml/src/ggml-cuda/upscale.cu +++ b/ggml/src/ggml-cuda/upscale.cu @@ -107,7 +107,7 @@ static __global__ void upscale_f32_bilinear_antialias(const float * src0, float const float y = ((float)i11_dst + pixel_offset) / sf1; const float x = ((float)i10_dst + pixel_offset) / sf0; - // support and invscale, maximum 1 pixel for bilinear + // support and invscale, minimum 1 pixel for bilinear const float support1 = max(1.0f / sf1, 1.0f); const float invscale1 = 1.0f / support1; const float support0 = max(1.0f / sf0, 1.0f); @@ -281,7 +281,7 @@ void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { if (mode == GGML_SCALE_MODE_NEAREST) { upscale_f32_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, stream); } else if (mode == GGML_SCALE_MODE_BILINEAR) { - bool antialias = (mode_flags & GGML_SCALE_FLAG_ANTIALIAS); + const bool antialias = (mode_flags & GGML_SCALE_FLAG_ANTIALIAS); upscale_f32_bilinear_cuda(src0_d, dst_d, src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], src0->ne[0], src0->ne[1], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], sf0, sf1, sf2, sf3, pixel_offset, antialias, stream); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 
b99345a2e93..17cf4d84bb8 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -4891,6 +4891,8 @@ static struct ggml_tensor * ggml_interpolate_impl( int64_t ne3, uint32_t mode) { GGML_ASSERT((mode & 0xFF) < GGML_SCALE_MODE_COUNT); + // TODO: implement antialias for modes other than bilinear + GGML_ASSERT(!(mode & GGML_SCALE_FLAG_ANTIALIAS) || (mode & 0xFF) == GGML_SCALE_MODE_BILINEAR); struct ggml_tensor * result = ggml_new_tensor_4d(ctx, a->type, ne0, ne1, ne2, ne3); diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 490d3ab0ec7..db477bbbe9b 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -2787,6 +2787,7 @@ struct clip_model_loader { { get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false); // ref: https://huggingface.co/LiquidAI/LFM2-VL-3B/blob/main/preprocessor_config.json + // config above specifies number of tokens after downsampling, while here it is before, relax lower bound to 64 hparams.set_limit_image_tokens(64, 1024); } break; case PROJECTOR_TYPE_PIXTRAL: From 31be1a9fd11fadfa1d5df9bc0709d5f38ba40137 Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Sun, 30 Nov 2025 11:18:49 +0100 Subject: [PATCH 12/13] Roll back default marker placement changes --- tools/mtmd/mtmd-cli.cpp | 4 ++-- tools/mtmd/mtmd.cpp | 9 --------- tools/mtmd/mtmd.h | 3 --- 3 files changed, 2 insertions(+), 14 deletions(-) diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp index fff8714bfe3..6679de309b4 100644 --- a/tools/mtmd/mtmd-cli.cpp +++ b/tools/mtmd/mtmd-cli.cpp @@ -313,7 +313,7 @@ int main(int argc, char ** argv) { g_is_generating = true; if (params.prompt.find(mtmd_default_marker()) == std::string::npos) { for (size_t i = 0; i < params.image.size(); i++) { - params.prompt = mtmd::mtmd_add_default_marker(ctx.ctx_vision.get(), params.prompt); + params.prompt += mtmd_default_marker(); } } common_chat_msg msg; @@ -378,7 +378,7 @@ int main(int argc, char ** argv) { std::string media_path = line.substr(7); if (ctx.load_media(media_path)) { LOG("%s %s 
loaded\n", media_path.c_str(), is_image ? "image" : "audio"); - content = mtmd::mtmd_add_default_marker(ctx.ctx_vision.get(), content); + content += mtmd_default_marker(); } // else, error is already printed by libmtmd continue; diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index e616c464d05..6690bf30046 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -1103,12 +1103,3 @@ void mtmd_log_set(ggml_log_callback log_callback, void * user_data) { g_logger_state.log_callback = log_callback ? log_callback : clip_log_callback_default; g_logger_state.log_callback_user_data = user_data; } - -std::string mtmd::mtmd_add_default_marker(mtmd_context *ctx, const std::string &str) { - // for LFM2 image embeddings positioned before the text - if (ctx && ctx->ctx_v && clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_LFM2) { - return mtmd_default_marker() + str; - } - - return str + mtmd_default_marker(); -} diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index d204bd594db..015119be897 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -299,9 +299,6 @@ struct input_chunks { } }; -// insert mtmd_default_marker() into given string, position depends on the projector -std::string mtmd_add_default_marker(mtmd_context *ctx, const std::string &str); - } // namespace mtmd #endif From 2385ecf1b7b2952a81e335d2be80bba18d611c5d Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Sun, 30 Nov 2025 11:27:12 +0100 Subject: [PATCH 13/13] Fallback to CPU implementation for antialias implementation of upscale --- ggml/src/ggml-cann/ggml-cann.cpp | 3 +++ ggml/src/ggml-metal/ggml-metal-device.m | 2 +- ggml/src/ggml-opencl/ggml-opencl.cpp | 3 ++- ggml/src/ggml-sycl/ggml-sycl.cpp | 2 +- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 1 + 5 files changed, 8 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index df28d67fb0b..cd1b5e5b944 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp 
@@ -2500,6 +2500,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, const ggml_ten if (op->op_params[0] != GGML_SCALE_MODE_NEAREST) { return false; } + if (op->op_params[0] & GGML_SCALE_FLAG_ANTIALIAS) { + return false; + } return true; } case GGML_OP_POOL_2D: diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index 09b1b503118..3aad16a3ff7 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -894,7 +894,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te case GGML_OP_POOL_1D: return false; case GGML_OP_UPSCALE: - return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST; + return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST && !(op->op_params[0] & GGML_SCALE_FLAG_ANTIALIAS); case GGML_OP_POOL_2D: return op->src[0]->type == GGML_TYPE_F32; case GGML_OP_PAD: diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index e5302f4550e..277a30d30ed 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -3086,8 +3086,9 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; case GGML_OP_UPSCALE: { ggml_scale_mode mode = (ggml_scale_mode)(ggml_get_op_params_i32(op, 0) & 0xFF); + const bool antialias = (ggml_scale_mode)(ggml_get_op_params_i32(op, 0) & GGML_SCALE_FLAG_ANTIALIAS); return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32 && - (mode == GGML_SCALE_MODE_NEAREST || mode == GGML_SCALE_MODE_BILINEAR); + (mode == GGML_SCALE_MODE_NEAREST || mode == GGML_SCALE_MODE_BILINEAR) && !antialias; } case GGML_OP_CONV_2D: return (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16 && op->type == GGML_TYPE_F16) || diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp 
b/ggml/src/ggml-sycl/ggml-sycl.cpp index 3f1bdfb9f1b..e82b51206e2 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4597,7 +4597,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_IM2COL: return true; case GGML_OP_UPSCALE: - return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST; + return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST && !(op->op_params[0] & GGML_SCALE_FLAG_ANTIALIAS); case GGML_OP_SUM: case GGML_OP_SUM_ROWS: case GGML_OP_MEAN: diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 66dd0bfabd2..95966ce1d8e 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -14113,6 +14113,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm } return true; case GGML_OP_UPSCALE: + return op->src[0]->type == GGML_TYPE_F32 && !(op->op_params[0] & GGML_SCALE_FLAG_ANTIALIAS); case GGML_OP_ACC: return op->src[0]->type == GGML_TYPE_F32; case GGML_OP_CONCAT: