From 7d0c853da2d8163848762e6f8b0687e963acfd4f Mon Sep 17 00:00:00 2001
From: michaelfeil
Date: Fri, 21 Nov 2025 17:01:25 +0000
Subject: [PATCH 1/3] is causal

---
 backends/candle/src/models/flash_qwen2.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backends/candle/src/models/flash_qwen2.rs b/backends/candle/src/models/flash_qwen2.rs
index c9116311a..b6f0cb8f0 100644
--- a/backends/candle/src/models/flash_qwen2.rs
+++ b/backends/candle/src/models/flash_qwen2.rs
@@ -111,7 +111,7 @@ impl Qwen2Attention {
             max_s,
             max_s,
             self.softmax_scale,
-            false,
+            true,
             None,
             None,
         )?;

From e64b1d89b4fb1f1db87ac35155ad24b04fca527d Mon Sep 17 00:00:00 2001
From: michaelfeil
Date: Fri, 21 Nov 2025 17:16:55 +0000
Subject: [PATCH 2/3] support qwen2 flash

---
 backends/candle/src/models/flash_qwen2.rs | 4 +++-
 backends/candle/src/models/qwen2.rs       | 8 ++++++++
 router/src/lib.rs                         | 8 ++++++--
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/backends/candle/src/models/flash_qwen2.rs b/backends/candle/src/models/flash_qwen2.rs
index b6f0cb8f0..f94e26ae9 100644
--- a/backends/candle/src/models/flash_qwen2.rs
+++ b/backends/candle/src/models/flash_qwen2.rs
@@ -15,6 +15,7 @@ struct Qwen2Attention {
     attention_head_size: usize,
 
     softmax_scale: f32,
+    is_causal: bool,
 
     span: tracing::Span,
 }
@@ -66,6 +67,7 @@ impl Qwen2Attention {
             num_key_value_heads,
             attention_head_size,
             softmax_scale,
+            is_causal: config.is_causal,
             span: tracing::span!(tracing::Level::TRACE, "attention"),
         })
     }
@@ -111,7 +113,7 @@ impl Qwen2Attention {
             max_s,
             max_s,
             self.softmax_scale,
-            true,
+            self.is_causal,
             None,
             None,
         )?;
diff --git a/backends/candle/src/models/qwen2.rs b/backends/candle/src/models/qwen2.rs
index 42559b875..f7855cd96 100644
--- a/backends/candle/src/models/qwen2.rs
+++ b/backends/candle/src/models/qwen2.rs
@@ -1,5 +1,11 @@
 use crate::layers::HiddenAct;
 use serde::Deserialize;
+use tracing;
+
+fn default_is_causal() -> bool {
+    tracing::warn!("is_causal not set in Qwen2Config, defaulting to true. For example, Alibaba-NLP/gte-Qwen2-1.5B-instruct was trained with causal=False attention, while jinaai/jina-code-embeddings-0.5b uses causal=True. Please set this field explicitly in the Hugging Face repo to avoid this warning.");
+    true
+}
 
 #[derive(Debug, Clone, PartialEq, Deserialize)]
 pub struct Qwen2Config {
@@ -15,4 +21,6 @@ pub struct Qwen2Config {
     pub rope_theta: f32,
     pub sliding_window: Option<usize>,
     pub use_sliding_window: bool,
+    #[serde(default = "default_is_causal")]
+    pub is_causal: bool,
 }
diff --git a/router/src/lib.rs b/router/src/lib.rs
index d83bd95c5..5a2361736 100644
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@@ -141,9 +141,12 @@ pub async fn run(
         "tokenizer.json not found. text-embeddings-inference only supports fast tokenizers",
     );
     tokenizer.with_padding(None);
-    // Qwen2 updates the post processor manually instead of into the tokenizer.json...
+    // Old Qwen2 repos update the post processor manually instead of shipping it in the tokenizer.json.
+    // Newer ones (https://huggingface.co/jinaai/jina-code-embeddings-0.5b/tree/main) have it in the tokenizer.json. This check supports both cases.
     // https://huggingface.co/Alibaba-NLP/gte-Qwen2-1.5B-instruct/blob/main/tokenization_qwen.py#L246
-    if config.model_type == "qwen2" {
+    if config.model_type == "qwen2" && config.auto_map.as_ref().map_or(false, |m| {
+        m.get("AutoModel") == Some(&"modeling_qwen.Qwen2Model".to_string())
+    }) {
         let template = TemplateProcessing::builder()
             .try_single("$A:0 <|endoftext|>:0")
             .unwrap()
@@ -449,6 +452,7 @@ pub struct ModelConfig {
     pub pad_token_id: usize,
     pub id2label: Option<HashMap<String, String>>,
     pub label2id: Option<HashMap<String, usize>>,
+    pub auto_map: Option<HashMap<String, String>>,
 }
 
 #[derive(Debug, Clone, PartialEq, Deserialize)]

From 690b43ae32faadd5f85044e347777ec86fcb3431 Mon Sep 17 00:00:00 2001
From: michaelfeil
Date: Fri, 21 Nov 2025 17:35:55 +0000
Subject: [PATCH 3/3] add warning

---
 router/src/lib.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/router/src/lib.rs b/router/src/lib.rs
index 5a2361736..a9837f1b2 100644
--- a/router/src/lib.rs
+++ b/router/src/lib.rs
@@ -147,6 +147,7 @@ pub async fn run(
     if config.model_type == "qwen2" && config.auto_map.as_ref().map_or(false, |m| {
         m.get("AutoModel") == Some(&"modeling_qwen.Qwen2Model".to_string())
     }) {
+        tracing::warn!("Detected a Qwen2 model with remote code; adding the post processor manually because the tokenizer.json does not contain one.");
         let template = TemplateProcessing::builder()
             .try_single("$A:0 <|endoftext|>:0")
             .unwrap()
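
Reviewer note (illustrative, not part of the patches): the #[serde(default = "default_is_causal")] attribute added to Qwen2Config in backends/candle/src/models/qwen2.rs means a config.json that omits is_causal falls back to the warning default of true, while a repo that sets "is_causal": false (e.g. a gte-style bidirectional embedding model) gets non-causal attention. The trimmed-down sketch below shows that behavior under those assumptions; MiniQwen2Config and its fields are stand-ins, not names from this PR.

    use serde::Deserialize;

    // Stand-in for the real Qwen2Config, reduced to the fields relevant here.
    #[derive(Debug, Deserialize)]
    struct MiniQwen2Config {
        hidden_size: usize,
        #[serde(default = "default_is_causal")]
        is_causal: bool,
    }

    fn default_is_causal() -> bool {
        // The real code emits tracing::warn!; eprintln! keeps this sketch dependency-free.
        eprintln!("is_causal not set, defaulting to true");
        true
    }

    fn main() {
        // Field absent: serde calls default_is_causal(), so the model stays causal and warns.
        let legacy: MiniQwen2Config =
            serde_json::from_str(r#"{ "hidden_size": 1536 }"#).unwrap();
        assert!(legacy.is_causal);

        // Field present: no default is invoked and no warning is logged.
        let explicit: MiniQwen2Config =
            serde_json::from_str(r#"{ "hidden_size": 1536, "is_causal": false }"#).unwrap();
        assert!(!explicit.is_causal);
    }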
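
Likewise, a sketch of the auto_map gate added to router/src/lib.rs: the manual <|endoftext|> post processor is now only injected when config.json declares the legacy remote-code model class, which older Alibaba-NLP/gte-Qwen2-1.5B-instruct-style repos are expected to carry, while newer repos such as jinaai/jina-code-embeddings-0.5b ship the post processor inside tokenizer.json and therefore skip this branch. needs_manual_post_processor below is a hypothetical helper mirroring the inlined condition, and the auto_map contents are an assumption about what such a legacy repo contains.

    use std::collections::HashMap;

    // Hypothetical helper mirroring the condition inlined in router/src/lib.rs.
    fn needs_manual_post_processor(
        model_type: &str,
        auto_map: &Option<HashMap<String, String>>,
    ) -> bool {
        model_type == "qwen2"
            && auto_map.as_ref().map_or(false, |m| {
                m.get("AutoModel") == Some(&"modeling_qwen.Qwen2Model".to_string())
            })
    }

    fn main() {
        // Legacy remote-code repo: config.json is assumed to contain
        // "auto_map": { "AutoModel": "modeling_qwen.Qwen2Model", ... }.
        let legacy = Some(HashMap::from([(
            "AutoModel".to_string(),
            "modeling_qwen.Qwen2Model".to_string(),
        )]));
        assert!(needs_manual_post_processor("qwen2", &legacy));

        // Newer repos without an auto_map keep the post processor from tokenizer.json
        // and must not have the template added a second time.
        assert!(!needs_manual_post_processor("qwen2", &None));
    }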