diff --git a/vllm/model_executor/models/flex_olmo.py b/vllm/model_executor/models/flex_olmo.py
index 11d0949a798a..1ee33f83b7c9 100644
--- a/vllm/model_executor/models/flex_olmo.py
+++ b/vllm/model_executor/models/flex_olmo.py
@@ -16,6 +16,7 @@
 
 import torch
 from torch import nn
+from transformers import FlexOlmoConfig
 
 from vllm.config import VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
@@ -24,7 +25,6 @@
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import ReplicatedLinear
 from vllm.model_executor.models.olmoe import OlmoeAttention, OlmoeForCausalLM
-from vllm.transformers_utils.configs import FlexOlmoConfig
 
 logger = init_logger(__name__)
 
diff --git a/vllm/model_executor/models/olmo2.py b/vllm/model_executor/models/olmo2.py
index 88e9c2d8541a..a5e92fbc296e 100644
--- a/vllm/model_executor/models/olmo2.py
+++ b/vllm/model_executor/models/olmo2.py
@@ -30,7 +30,7 @@
 
 import torch
 from torch import nn
-from transformers import Olmo2Config
+from transformers import Olmo2Config, Olmo3Config
 
 from vllm.attention.layer import Attention
 from vllm.compilation.decorators import support_torch_compile
@@ -63,7 +63,6 @@
     maybe_prefix,
 )
 from vllm.sequence import IntermediateTensors
-from vllm.transformers_utils.configs import Olmo3Config
 
 
 class Olmo2Attention(nn.Module):
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 66680f410cb3..c122f3d9f048 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -88,7 +88,6 @@ def __getitem__(self, key):
     chatglm="ChatGLMConfig",
     deepseek_vl_v2="DeepseekVLV2Config",
     deepseek_v32="DeepseekV3Config",
-    flex_olmo="FlexOlmoConfig",
     hunyuan_vl="HunYuanVLConfig",
     kimi_linear="KimiLinearConfig",
     kimi_vl="KimiVLConfig",
@@ -101,7 +100,6 @@ def __getitem__(self, key):
     eagle="EAGLEConfig",
     speculators="SpeculatorsConfig",
     nemotron="NemotronConfig",
-    olmo3="Olmo3Config",
     ovis="OvisConfig",
     ultravox="UltravoxConfig",
     step3_vl="Step3VLConfig",
diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py
index 109f2b698651..aa1a9077af77 100644
--- a/vllm/transformers_utils/configs/__init__.py
+++ b/vllm/transformers_utils/configs/__init__.py
@@ -22,7 +22,6 @@
 # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
 # `FalconConfig` class from the official HuggingFace transformers library.
 from vllm.transformers_utils.configs.falcon import RWConfig
-from vllm.transformers_utils.configs.flex_olmo import FlexOlmoConfig
 from vllm.transformers_utils.configs.hunyuan_vl import (
     HunYuanVLConfig,
     HunYuanVLTextConfig,
@@ -38,7 +37,6 @@
 from vllm.transformers_utils.configs.moonvit import MoonViTConfig
 from vllm.transformers_utils.configs.nemotron import NemotronConfig
 from vllm.transformers_utils.configs.nemotron_h import NemotronHConfig
-from vllm.transformers_utils.configs.olmo3 import Olmo3Config
 from vllm.transformers_utils.configs.ovis import OvisConfig
 from vllm.transformers_utils.configs.qwen3_next import Qwen3NextConfig
 from vllm.transformers_utils.configs.radio import RadioConfig
@@ -57,7 +55,6 @@
     "DeepseekV3Config",
     "DotsOCRConfig",
     "EAGLEConfig",
-    "FlexOlmoConfig",
     "HunYuanVLConfig",
     "HunYuanVLTextConfig",
     "HunYuanVLVisionConfig",
@@ -72,7 +69,6 @@
     "KimiVLConfig",
     "NemotronConfig",
     "NemotronHConfig",
-    "Olmo3Config",
     "OvisConfig",
     "RadioConfig",
     "SpeculatorsConfig",
diff --git a/vllm/transformers_utils/configs/flex_olmo.py b/vllm/transformers_utils/configs/flex_olmo.py
deleted file mode 100644
index c343dc0999a8..000000000000
--- a/vllm/transformers_utils/configs/flex_olmo.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Any
-
-from transformers.configuration_utils import PretrainedConfig
-
-
-class FlexOlmoConfig(PretrainedConfig):
-    model_type = "flex_olmo"
-    keys_to_ignore_at_inference = ["past_key_values"]
-
-    def __init__(
-        self,
-        vocab_size=100352,
-        hidden_size=4096,
-        intermediate_size=11008,
-        num_hidden_layers=32,
-        num_attention_heads=32,
-        num_key_value_heads=None,
-        hidden_act="silu",
-        max_position_embeddings=4096,
-        initializer_range=0.02,
-        rms_norm_eps=1e-06,
-        use_cache=True,
-        pad_token_id=100277,
-        bos_token_id=None,
-        eos_token_id=100257,
-        tie_word_embeddings=False,
-        rope_parameters: dict[str, Any] | None = None,
-        attention_bias=False,
-        attention_dropout=0.0,
-        num_experts_per_tok=5,
-        num_experts=7,
-        output_router_logits=False,
-        router_aux_loss_coef=0.01,
-        norm_topk_prob=False,
-        **kwargs,
-    ):
-        if "architectures" not in kwargs:
-            kwargs["architectures"] = ["FlexOlmoForCausalLM"]
-
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
-        self.vocab_size = vocab_size
-        self.max_position_embeddings = max_position_embeddings
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-
-        # for backward compatibility
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
-
-        self.num_key_value_heads = num_key_value_heads
-        self.hidden_act = hidden_act
-        self.initializer_range = initializer_range
-        self.rms_norm_eps = rms_norm_eps
-        self.use_cache = use_cache
-        # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
-        rope_scaling = kwargs.pop("rope_scaling", None)
-        rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
-        rope_theta = kwargs.pop("rope_theta", 500000.0)
-        if "rope_theta" not in rope_parameters:
-            rope_parameters["rope_theta"] = rope_theta
-        self.rope_parameters = rope_parameters
-        self.attention_bias = attention_bias
-        self.attention_dropout = attention_dropout
-        self.num_experts_per_tok = num_experts_per_tok
-        self.num_experts = num_experts
-        self.output_router_logits = output_router_logits
-        self.router_aux_loss_coef = router_aux_loss_coef
-        self.norm_topk_prob = norm_topk_prob
-        # Validate the correctness of rotary position embeddings parameters
-        # BC: if there is a 'type' field, move it to 'rope_type'.
-        if self.rope_parameters is not None and "type" in self.rope_parameters:
-            self.rope_parameters["rope_type"] = self.rope_parameters["type"]
diff --git a/vllm/transformers_utils/configs/olmo3.py b/vllm/transformers_utils/configs/olmo3.py
deleted file mode 100644
index c4691b661af3..000000000000
--- a/vllm/transformers_utils/configs/olmo3.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from transformers.configuration_utils import PretrainedConfig
-
-
-class Olmo3Config(PretrainedConfig):
-    model_type = "olmo3"
-    keys_to_ignore_at_inference = ["past_key_values"]
-
-    def __init__(
-        self,
-        vocab_size=50304,
-        hidden_size=4096,
-        intermediate_size=11008,
-        num_hidden_layers=32,
-        num_attention_heads=32,
-        num_key_value_heads=None,
-        hidden_act="silu",
-        max_position_embeddings=2048,
-        initializer_range=0.02,
-        use_cache=True,
-        pad_token_id=1,
-        bos_token_id=None,
-        eos_token_id=50279,
-        tie_word_embeddings=False,
-        rope_parameters=None,
-        attention_bias=False,
-        attention_dropout=0.0,
-        rms_norm_eps=1e-5,
-        sliding_window=4096,
-        layer_types=None,
-        **kwargs,
-    ):
-        # This model uses Olmo3ForCausalLM in transformers but Olmo2ForCausalLM
-        # in vLLM.
-        if "architectures" not in kwargs:
-            kwargs["architectures"] = ["Olmo2ForCausalLM"]
-        elif "Olmo3ForCausalLM" in kwargs["architectures"]:
-            kwargs["architectures"].remove("Olmo3ForCausalLM")
-            kwargs["architectures"].append("Olmo2ForCausalLM")
-
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
-        self.vocab_size = vocab_size
-        self.max_position_embeddings = max_position_embeddings
-        self.hidden_size = hidden_size
-        self.intermediate_size = intermediate_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-
-        # for backward compatibility
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
-
-        self.num_key_value_heads = num_key_value_heads
-        self.hidden_act = hidden_act
-        self.initializer_range = initializer_range
-        self.use_cache = use_cache
-        # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
-        rope_scaling = kwargs.pop("rope_scaling", None)
-        rope_parameters = rope_scaling or rope_parameters or {"rope_type": "default"}
-        rope_theta = kwargs.pop("rope_theta", 10000.0)
-        if "rope_theta" not in rope_parameters:
-            rope_parameters["rope_theta"] = rope_theta
-        self.rope_parameters = rope_parameters
-        self.attention_bias = attention_bias
-        self.attention_dropout = attention_dropout
-
-        self.rms_norm_eps = rms_norm_eps
-
-        self.sliding_window = sliding_window
-        self.layer_types = layer_types
-        if self.layer_types is None:
-            self.layer_types = [
-                "sliding_attention" if (i + 1) % 4 != 0 else "full_attention"
-                for i in range(self.num_hidden_layers)
-            ]
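
A minimal import-switch sketch, assuming an installed `transformers` release that already ships `FlexOlmoConfig` and `Olmo3Config` (the minimum required version is not pinned in this diff); the attribute names are taken from the vendored configs deleted above and may differ slightly in the upstream classes:

```python
# Sanity check: both config classes should now resolve from HuggingFace
# transformers rather than vllm.transformers_utils.configs.
from transformers import FlexOlmoConfig, Olmo3Config

olmo3 = Olmo3Config()
# Fields mirroring the deleted vendored Olmo3Config (assumed to exist upstream).
print(olmo3.model_type, olmo3.sliding_window)

flex = FlexOlmoConfig()
# MoE routing fields mirroring the deleted vendored FlexOlmoConfig.
print(flex.model_type, flex.num_experts, flex.num_experts_per_tok)
```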