From 7d0c853da2d8163848762e6f8b0687e963acfd4f Mon Sep 17 00:00:00 2001
From: michaelfeil
Date: Fri, 21 Nov 2025 17:01:25 +0000
Subject: [PATCH 1/3] is causal

---
 backends/candle/src/models/flash_qwen2.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backends/candle/src/models/flash_qwen2.rs b/backends/candle/src/models/flash_qwen2.rs
index c9116311a..b6f0cb8f0 100644
--- a/backends/candle/src/models/flash_qwen2.rs
+++ b/backends/candle/src/models/flash_qwen2.rs
@@ -111,7 +111,7 @@ impl Qwen2Attention {
             max_s,
             max_s,
             self.softmax_scale,
-            false,
+            true,
             None,
             None,
         )?;

From e64b1d89b4fb1f1db87ac35155ad24b04fca527d Mon Sep 17 00:00:00 2001
From: michaelfeil
Date: Fri, 21 Nov 2025 17:16:55 +0000
Subject: [PATCH 2/3] support qwen2 flash

---
 backends/candle/src/models/flash_qwen2.rs | 4 +++-
 backends/candle/src/models/qwen2.rs       | 8 ++++++++
 router/src/lib.rs                         | 8 ++++++--
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/backends/candle/src/models/flash_qwen2.rs b/backends/candle/src/models/flash_qwen2.rs
index b6f0cb8f0..f94e26ae9 100644
--- a/backends/candle/src/models/flash_qwen2.rs
+++ b/backends/candle/src/models/flash_qwen2.rs
@@ -15,6 +15,7 @@ struct Qwen2Attention {
     attention_head_size: usize,
 
     softmax_scale: f32,
+    is_causal: bool,
 
     span: tracing::Span,
 }
@@ -66,6 +67,7 @@ impl Qwen2Attention {
             num_key_value_heads,
             attention_head_size,
             softmax_scale,
+            is_causal: config.is_causal,
             span: tracing::span!(tracing::Level::TRACE, "attention"),
         })
     }
@@ -111,7 +113,7 @@ impl Qwen2Attention {
             max_s,
             max_s,
             self.softmax_scale,
-            true,
+            self.is_causal,
             None,
             None,
         )?;
diff --git a/backends/candle/src/models/qwen2.rs b/backends/candle/src/models/qwen2.rs
index 42559b875..f7855cd96 100644
--- a/backends/candle/src/models/qwen2.rs
+++ b/backends/candle/src/models/qwen2.rs
@@ -1,5 +1,11 @@
 use crate::layers::HiddenAct;
 use serde::Deserialize;
+use tracing;
+
+fn default_is_causal() -> bool {
+    tracing::warn!("is_causal not set in Qwen2Config, defaulting to true. For example, Alibaba-NLP/gte-Qwen2-1.5B-instruct was trained with causal=False attention, while jinaai/jina-code-embeddings-0.5b uses causal=True. Please set this field explicitly in the Hugging Face repo to avoid this warning.");
+    true
+}
 
 #[derive(Debug, Clone, PartialEq, Deserialize)]
 pub struct Qwen2Config {
@@ -15,4 +21,6 @@ pub struct Qwen2Config {
     pub rope_theta: f32,
     pub sliding_window: Option<usize>,
     pub use_sliding_window: bool,
+    #[serde(default = "default_is_causal")]
+    pub is_causal: bool,
 }
diff --git a/router/src/lib.rs b/router/src/lib.rs
index d83bd95c5..5a2361736 100644
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@@ -141,9 +141,12 @@ pub async fn run(
         "tokenizer.json not found. text-embeddings-inference only supports fast tokenizers",
     );
     tokenizer.with_padding(None);
-    // Qwen2 updates the post processor manually instead of into the tokenizer.json...
+    // Old Qwen2 repos update the post processor manually instead of shipping it in the tokenizer.json.
+    // Newer ones (https://huggingface.co/jinaai/jina-code-embeddings-0.5b/tree/main) have it in the tokenizer.json. This check supports both cases.
     // https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct/blob/main/tokenization_qwen.py#L246
-    if config.model_type == "qwen2" {
+    if config.model_type == "qwen2" && config.auto_map.as_ref().map_or(false, |m| {
+        m.get("AutoModel") == Some(&"modeling_qwen.Qwen2Model".to_string())
+    }) {
         let template = TemplateProcessing::builder()
             .try_single("$A:0 <|endoftext|>:0")
             .unwrap()
@@ -449,6 +452,7 @@ pub struct ModelConfig {
     pub pad_token_id: usize,
     pub id2label: Option<HashMap<String, String>>,
     pub label2id: Option<HashMap<String, usize>>,
+    pub auto_map: Option<HashMap<String, String>>,
 }
 
 #[derive(Debug, Clone, PartialEq, Deserialize)]

From 690b43ae32faadd5f85044e347777ec86fcb3431 Mon Sep 17 00:00:00 2001
From: michaelfeil
Date: Fri, 21 Nov 2025 17:35:55 +0000
Subject: [PATCH 3/3] add warning

---
 router/src/lib.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/router/src/lib.rs b/router/src/lib.rs
index 5a2361736..a9837f1b2 100644
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@@ -147,6 +147,7 @@ pub async fn run(
     if config.model_type == "qwen2" && config.auto_map.as_ref().map_or(false, |m| {
         m.get("AutoModel") == Some(&"modeling_qwen.Qwen2Model".to_string())
     }) {
+        tracing::warn!("Detected a Qwen2 model with remote code; adding the post processor manually because the tokenizer.json does not contain one.");
         let template = TemplateProcessing::builder()
             .try_single("$A:0 <|endoftext|>:0")
             .unwrap()
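
Reviewer note (illustrative, not part of the patches): the #[serde(default = "default_is_causal")] attribute added to Qwen2Config in backends/candle/src/models/qwen2.rs means a config.json that omits is_causal falls back to the warning default of true, while a repo that sets "is_causal": false (e.g. a gte-style bidirectional embedding model) gets non-causal attention. The trimmed-down sketch below shows that behavior under those assumptions; MiniQwen2Config and its fields are stand-ins, not names from this PR.

    use serde::Deserialize;

    // Stand-in for the real Qwen2Config, reduced to the fields relevant here.
    #[derive(Debug, Deserialize)]
    struct MiniQwen2Config {
        hidden_size: usize,
        #[serde(default = "default_is_causal")]
        is_causal: bool,
    }

    fn default_is_causal() -> bool {
        // The real code emits tracing::warn!; eprintln! keeps this sketch dependency-free.
        eprintln!("is_causal not set, defaulting to true");
        true
    }

    fn main() {
        // Field absent: serde calls default_is_causal(), so the model stays causal and warns.
        let legacy: MiniQwen2Config =
            serde_json::from_str(r#"{ "hidden_size": 1536 }"#).unwrap();
        assert!(legacy.is_causal);

        // Field present: no default is invoked and no warning is logged.
        let explicit: MiniQwen2Config =
            serde_json::from_str(r#"{ "hidden_size": 1536, "is_causal": false }"#).unwrap();
        assert!(!explicit.is_causal);
    }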
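
Likewise, a sketch of the auto_map gate added to router/src/lib.rs: the manual <|endoftext|> post processor is now only injected when config.json declares the legacy remote-code model class, which older Alibaba-NLP/gte-Qwen2-1.5B-instruct-style repos are expected to carry, while newer repos such as jinaai/jina-code-embeddings-0.5b ship the post processor inside tokenizer.json and therefore skip this branch. needs_manual_post_processor below is a hypothetical helper mirroring the inlined condition, and the auto_map contents are an assumption about what such a legacy repo contains.

    use std::collections::HashMap;

    // Hypothetical helper mirroring the condition inlined in router/src/lib.rs.
    fn needs_manual_post_processor(
        model_type: &str,
        auto_map: &Option<HashMap<String, String>>,
    ) -> bool {
        model_type == "qwen2"
            && auto_map.as_ref().map_or(false, |m| {
                m.get("AutoModel") == Some(&"modeling_qwen.Qwen2Model".to_string())
            })
    }

    fn main() {
        // Legacy remote-code repo: config.json is assumed to contain
        // "auto_map": { "AutoModel": "modeling_qwen.Qwen2Model", ... }.
        let legacy = Some(HashMap::from([(
            "AutoModel".to_string(),
            "modeling_qwen.Qwen2Model".to_string(),
        )]));
        assert!(needs_manual_post_processor("qwen2", &legacy));

        // Newer repos without an auto_map keep the post processor from tokenizer.json
        // and must not have the template added a second time.
        assert!(!needs_manual_post_processor("qwen2", &None));
    }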