diff --git a/src/transformers/models/glm4v/configuration_glm4v.py b/src/transformers/models/glm4v/configuration_glm4v.py
index 66f70bf438af..56ac02ebbfa9 100644
--- a/src/transformers/models/glm4v/configuration_glm4v.py
+++ b/src/transformers/models/glm4v/configuration_glm4v.py
@@ -165,10 +165,6 @@ class Glm4vTextConfig(PreTrainedConfig):
             Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
             a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE with
             longer `max_position_embeddings`.
-        image_token_id (`int`, *optional*):
-            Token index used as placeholder for image embeddings.
-        video_token_id (`int`, *optional*):
-            Token index used as placeholder for video embeddings.
 
     ```python
     >>> from transformers import Glm4vTextModel, Glm4vConfig
@@ -217,8 +213,6 @@ def __init__(
         tie_word_embeddings: Optional[bool] = False,
         attention_dropout: Optional[float] = 0.0,
         rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
-        image_token_id: Optional[int] = None,
-        video_token_id: Optional[int] = None,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -246,8 +240,6 @@ def __init__(
         rope_theta = kwargs.get("rope_theta", 10000.0)
         standardize_rope_params(self, rope_theta=rope_theta)
         rope_config_validation(self, ignore_keys={"mrope_section"})
-        self.image_token_id = image_token_id
-        self.video_token_id = video_token_id
 
         super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
diff --git a/src/transformers/models/glm4v/modular_glm4v.py b/src/transformers/models/glm4v/modular_glm4v.py
index dce3ef92c996..eecc4f4ab138 100644
--- a/src/transformers/models/glm4v/modular_glm4v.py
+++ b/src/transformers/models/glm4v/modular_glm4v.py
@@ -202,10 +202,6 @@ class Glm4vTextConfig(PreTrainedConfig):
             Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
             a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE with
             longer `max_position_embeddings`.
-        image_token_id (`int`, *optional*):
-            Token index used as placeholder for image embeddings.
-        video_token_id (`int`, *optional*):
-            Token index used as placeholder for video embeddings.
 
     ```python
     >>> from transformers import Glm4vTextModel, Glm4vConfig
@@ -254,8 +250,6 @@ def __init__(
         tie_word_embeddings: Optional[bool] = False,
         attention_dropout: Optional[float] = 0.0,
         rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
-        image_token_id: Optional[int] = None,
-        video_token_id: Optional[int] = None,
         **kwargs,
     ):
         self.vocab_size = vocab_size
@@ -283,8 +277,6 @@ def __init__(
         rope_theta = kwargs.get("rope_theta", 10000.0)
         standardize_rope_params(self, rope_theta=rope_theta)
         rope_config_validation(self, ignore_keys={"mrope_section"})
-        self.image_token_id = image_token_id
-        self.video_token_id = video_token_id
 
         super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
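
For context, a minimal sketch (not part of the diff) of what this change means for users of the config API. It assumes the placeholder ids remain declared on the top-level composite `Glm4vConfig`, so removing them from `Glm4vTextConfig` drops only the duplicated declaration on the text sub-config; the exact printed values are illustrative, not asserted defaults.

```python
# Sketch: after this change, the multimodal placeholder ids are read from the
# composite Glm4vConfig rather than from the text sub-config.
from transformers import Glm4vConfig

config = Glm4vConfig()

# The composite config still carries the placeholder token ids:
print(config.image_token_id, config.video_token_id)

# Glm4vTextConfig no longer declares them as explicit __init__ parameters,
# so a default text sub-config is not expected to define them:
print(hasattr(config.text_config, "image_token_id"))  # expected: False
```

If an older checkpoint happened to serialize these ids inside the text config, they would presumably still flow through `**kwargs` into `PreTrainedConfig.__init__` rather than being declared parameters, so deserialization should not break; that behavior is an assumption based on how extra config kwargs are normally handled.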