From 967cb1d7d2e7b2c01f15fa894cdf9eb39ca79b9e Mon Sep 17 00:00:00 2001 From: vasqu Date: Thu, 25 Sep 2025 17:55:48 +0200 Subject: [PATCH 1/5] remove from modeling files --- .../models/albert/modeling_albert.py | 49 ++---------- .../models/altclip/modeling_altclip.py | 40 ++-------- src/transformers/models/bert/modeling_bert.py | 73 +++--------------- .../modeling_bert_generation.py | 66 ++-------------- .../models/big_bird/modeling_big_bird.py | 1 - .../bridgetower/modeling_bridgetower.py | 76 +++--------------- .../models/camembert/modeling_camembert.py | 77 +++---------------- src/transformers/models/clap/modeling_clap.py | 9 +-- .../models/data2vec/modeling_data2vec_text.py | 77 +++---------------- .../models/electra/modeling_electra.py | 75 +++--------------- .../models/ernie/modeling_ernie.py | 76 ++---------------- .../models/ernie/modular_ernie.py | 7 +- .../models/roberta/modeling_roberta.py | 77 +++---------------- .../models/roberta/modular_roberta.py | 8 +- .../modeling_roberta_prelayernorm.py | 71 +++-------------- .../models/roc_bert/modeling_roc_bert.py | 77 +++---------------- .../xlm_roberta/modeling_xlm_roberta.py | 77 +++---------------- .../xlm_roberta_xl/modeling_xlm_roberta_xl.py | 76 ++---------------- .../xlm_roberta_xl/modular_xlm_roberta_xl.py | 11 ++- src/transformers/models/xmod/modeling_xmod.py | 71 +++-------------- tests/models/albert/test_modeling_albert.py | 7 -- tests/models/bert/test_modeling_bert.py | 47 ----------- tests/models/roberta/test_modeling_roberta.py | 13 ---- 23 files changed, 139 insertions(+), 1022 deletions(-) diff --git a/src/transformers/models/albert/modeling_albert.py b/src/transformers/models/albert/modeling_albert.py index 31caa335bb64..8384630701ab 100755 --- a/src/transformers/models/albert/modeling_albert.py +++ b/src/transformers/models/albert/modeling_albert.py @@ -69,7 +69,6 @@ def __init__(self, config: AlbertConfig): self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -106,11 +105,11 @@ def forward( if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings = embeddings + position_embeddings + embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings @@ -126,38 +125,13 @@ def eager_attention_forward( scaling: Optional[float] = None, dropout: float = 0.0, head_mask: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, **kwargs: Unpack[TransformersKwargs], ): if scaling is None: scaling = query.size(-1) ** -0.5 # Take the dot product between "query" and "key" to get the raw attention scores. 
- attn_weights = torch.matmul(query, key.transpose(2, 3)) - - # Relative positional embeddings - if module.position_embedding_type == "relative_key" or module.position_embedding_type == "relative_key_query": - query_length, key_length = query.shape[2], key.shape[2] - if use_cache: - position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=query.device).view(-1, 1) - else: - position_ids_l = torch.arange(query_length, dtype=torch.long, device=query.device).view(-1, 1) - position_ids_r = torch.arange(key_length, dtype=torch.long, device=query.device).view(1, -1) - distance = position_ids_l - position_ids_r - - positional_embedding = module.distance_embedding(distance + module.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query.dtype) # fp16 compatibility - - if module.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) - attn_weights = attn_weights + relative_position_scores - elif module.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key, positional_embedding) - attn_weights = attn_weights + relative_position_scores_query + relative_position_scores_key - - # Scaling is shifted in case of embeddings being relative - attn_weights = attn_weights * scaling + attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling if attention_mask is not None and attention_mask.ndim == 4: attention_mask = attention_mask[:, :, :, : key.shape[-2]] @@ -202,11 +176,6 @@ def __init__(self, config: AlbertConfig): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.pruned_heads = set() - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) - self.is_causal = False def prune_heads(self, heads: list[int]) -> None: @@ -244,11 +213,6 @@ def forward( attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.position_embedding_type != "absolute": - raise ValueError( - f"You are using {self.config._attn_implementation} as attention type. However, non-absolute " - 'positional embeddings can not work with them. Please load the model with `attn_implementation="eager"`.' 
- ) attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( @@ -260,8 +224,6 @@ def forward( dropout=0.0 if not self.training else self.attention_dropout.p, scaling=self.scaling, head_mask=head_mask, - # only for relevant for non-absolute positional embeddings - use_cache=False, **kwargs, ) attn_output = attn_output.reshape(*input_shape, -1).contiguous() @@ -444,7 +406,6 @@ def __init__(self, config: AlbertConfig, add_pooling_layer: bool = True): self.pooler_activation = None self.attn_implementation = config._attn_implementation - self.position_embedding_type = config.position_embedding_type # Initialize weights and apply final processing self.post_init() diff --git a/src/transformers/models/altclip/modeling_altclip.py b/src/transformers/models/altclip/modeling_altclip.py index ec8031507d50..c3ea4e445a50 100755 --- a/src/transformers/models/altclip/modeling_altclip.py +++ b/src/transformers/models/altclip/modeling_altclip.py @@ -100,7 +100,6 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) @@ -152,11 +151,11 @@ def forward( if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings = embeddings + position_embeddings + embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings @@ -197,7 +196,7 @@ def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_l class AltRobertaSelfAttention(nn.Module): - def __init__(self, config, position_embedding_type=None): + def __init__(self, config): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -214,12 +213,6 @@ def __init__(self, config, position_embedding_type=None): self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) def forward( self, @@ -238,23 +231,6 @@ def forward( # Take the dot product between "query" and "key" to get the raw attention scores. 
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - query_length, key_length = query_layer.shape[2], key_layer.shape[2] - position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) - position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) - distance = position_ids_l - position_ids_r - - positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility - - if self.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores - elif self.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in AltRobertaModel forward() function) @@ -303,11 +279,9 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to class AltRobertaAttention(nn.Module): - def __init__(self, config, position_embedding_type=None): + def __init__(self, config): super().__init__() - self.self = ALT_ROBERTA_SELF_ATTENTION_CLASSES[config._attn_implementation]( - config, position_embedding_type=position_embedding_type - ) + self.self = ALT_ROBERTA_SELF_ATTENTION_CLASSES[config._attn_implementation](config) self.output = AltRobertaSelfOutput(config) self.pruned_heads = set() diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 384e34351ea7..54db194baf46 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -67,7 +67,6 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) @@ -108,11 +107,11 @@ def forward( if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings = embeddings + position_embeddings + embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings @@ -127,38 +126,13 @@ def eager_attention_forward( scaling: Optional[float] = None, dropout: float = 0.0, head_mask: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, **kwargs: 
Unpack[TransformersKwargs], ): if scaling is None: scaling = query.size(-1) ** -0.5 # Take the dot product between "query" and "key" to get the raw attention scores. - attn_weights = torch.matmul(query, key.transpose(2, 3)) - - # Relative positional embeddings - if module.position_embedding_type == "relative_key" or module.position_embedding_type == "relative_key_query": - query_length, key_length = query.shape[2], key.shape[2] - if use_cache: - position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=query.device).view(-1, 1) - else: - position_ids_l = torch.arange(query_length, dtype=torch.long, device=query.device).view(-1, 1) - position_ids_r = torch.arange(key_length, dtype=torch.long, device=query.device).view(1, -1) - distance = position_ids_l - position_ids_r - - positional_embedding = module.distance_embedding(distance + module.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query.dtype) # fp16 compatibility - - if module.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) - attn_weights = attn_weights + relative_position_scores - elif module.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key, positional_embedding) - attn_weights = attn_weights + relative_position_scores_query + relative_position_scores_key - - # Scaling is shifted in case of embeddings being relative - attn_weights = attn_weights * scaling + attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling if attention_mask is not None and attention_mask.ndim == 4: attention_mask = attention_mask[:, :, :, : key.shape[-2]] @@ -177,7 +151,7 @@ def eager_attention_forward( class BertSelfAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, is_causal=False, layer_idx=None): + def __init__(self, config, is_causal=False, layer_idx=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -196,12 +170,6 @@ def __init__(self, config, position_embedding_type=None, is_causal=False, layer_ self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_decoder = config.is_decoder self.is_causal = is_causal @@ -240,11 +208,6 @@ def forward( attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.position_embedding_type != "absolute": - raise ValueError( - f"You are using {self.config._attn_implementation} as attention type. However, non-absolute " - 'positional embeddings can not work with them. Please load the model with `attn_implementation="eager"`.' 
- ) attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( @@ -256,8 +219,6 @@ def forward( dropout=0.0 if not self.training else self.dropout.p, scaling=self.scaling, head_mask=head_mask, - # only for relevant for non-absolute positional embeddings - use_cache=past_key_value is not None, **kwargs, ) attn_output = attn_output.reshape(*input_shape, -1).contiguous() @@ -265,7 +226,7 @@ def forward( class BertCrossAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, is_causal=False, layer_idx=None): + def __init__(self, config, is_causal=False, layer_idx=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -284,12 +245,6 @@ def __init__(self, config, position_embedding_type=None, is_causal=False, layer_ self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_causal = is_causal self.layer_idx = layer_idx @@ -332,11 +287,6 @@ def forward( attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.position_embedding_type != "absolute": - raise ValueError( - f"You are using {self.config._attn_implementation} as attention type. However, non-absolute " - 'positional embeddings can not work with them. Please load the model with `attn_implementation="eager"`.' 
- ) attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( @@ -348,8 +298,6 @@ def forward( dropout=0.0 if not self.training else self.dropout.p, scaling=self.scaling, head_mask=head_mask, - # only for relevant for non-absolute positional embeddings - use_cache=past_key_value is not None, **kwargs, ) attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous() @@ -372,13 +320,13 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to class BertAttention(nn.Module): def __init__( - self, config, position_embedding_type=None, is_causal=False, layer_idx=None, is_cross_attention=False + self, config, is_causal=False, layer_idx=None, is_cross_attention=False ): super().__init__() self.is_cross_attention = is_cross_attention attention_class = BertCrossAttention if is_cross_attention else BertSelfAttention self.self = attention_class( - config, position_embedding_type=position_embedding_type, is_causal=is_causal, layer_idx=layer_idx + config, is_causal=is_causal, layer_idx=layer_idx ) self.output = BertSelfOutput(config) self.pruned_heads = set() @@ -468,7 +416,6 @@ def __init__(self, config, layer_idx=None): raise ValueError(f"{self} should be used as a decoder model if cross attention is added") self.crossattention = BertAttention( config, - position_embedding_type="absolute", is_causal=False, layer_idx=layer_idx, is_cross_attention=True, @@ -737,8 +684,6 @@ def __init__(self, config, add_pooling_layer=True): self.pooler = BertPooler(config) if add_pooling_layer else None - self.position_embedding_type = config.position_embedding_type - # Initialize weights and apply final processing self.post_init() diff --git a/src/transformers/models/bert_generation/modeling_bert_generation.py b/src/transformers/models/bert_generation/modeling_bert_generation.py index 8966adc1eb26..3f2a9fb68c32 100755 --- a/src/transformers/models/bert_generation/modeling_bert_generation.py +++ b/src/transformers/models/bert_generation/modeling_bert_generation.py @@ -71,38 +71,13 @@ def eager_attention_forward( scaling: Optional[float] = None, dropout: float = 0.0, head_mask: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, **kwargs: Unpack[TransformersKwargs], ): if scaling is None: scaling = query.size(-1) ** -0.5 # Take the dot product between "query" and "key" to get the raw attention scores. 
- attn_weights = torch.matmul(query, key.transpose(2, 3)) - - # Relative positional embeddings - if module.position_embedding_type == "relative_key" or module.position_embedding_type == "relative_key_query": - query_length, key_length = query.shape[2], key.shape[2] - if use_cache: - position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=query.device).view(-1, 1) - else: - position_ids_l = torch.arange(query_length, dtype=torch.long, device=query.device).view(-1, 1) - position_ids_r = torch.arange(key_length, dtype=torch.long, device=query.device).view(1, -1) - distance = position_ids_l - position_ids_r - - positional_embedding = module.distance_embedding(distance + module.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query.dtype) # fp16 compatibility - - if module.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) - attn_weights = attn_weights + relative_position_scores - elif module.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key, positional_embedding) - attn_weights = attn_weights + relative_position_scores_query + relative_position_scores_key - - # Scaling is shifted in case of embeddings being relative - attn_weights = attn_weights * scaling + attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling if attention_mask is not None and attention_mask.ndim == 4: attention_mask = attention_mask[:, :, :, : key.shape[-2]] @@ -122,7 +97,7 @@ def eager_attention_forward( # Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->BertGeneration class BertGenerationSelfAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, is_causal=False, layer_idx=None): + def __init__(self, config, is_causal=False, layer_idx=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -141,12 +116,6 @@ def __init__(self, config, position_embedding_type=None, is_causal=False, layer_ self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_decoder = config.is_decoder self.is_causal = is_causal @@ -185,11 +154,6 @@ def forward( attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.position_embedding_type != "absolute": - raise ValueError( - f"You are using {self.config._attn_implementation} as attention type. However, non-absolute " - 'positional embeddings can not work with them. Please load the model with `attn_implementation="eager"`.' 
- ) attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( @@ -201,8 +165,6 @@ def forward( dropout=0.0 if not self.training else self.dropout.p, scaling=self.scaling, head_mask=head_mask, - # only for relevant for non-absolute positional embeddings - use_cache=past_key_value is not None, **kwargs, ) attn_output = attn_output.reshape(*input_shape, -1).contiguous() @@ -211,7 +173,7 @@ def forward( # Copied from transformers.models.bert.modeling_bert.BertCrossAttention with Bert->BertGeneration class BertGenerationCrossAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, is_causal=False, layer_idx=None): + def __init__(self, config, is_causal=False, layer_idx=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -230,12 +192,6 @@ def __init__(self, config, position_embedding_type=None, is_causal=False, layer_ self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_causal = is_causal self.layer_idx = layer_idx @@ -278,11 +234,6 @@ def forward( attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.position_embedding_type != "absolute": - raise ValueError( - f"You are using {self.config._attn_implementation} as attention type. However, non-absolute " - 'positional embeddings can not work with them. Please load the model with `attn_implementation="eager"`.' 
- ) attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( @@ -294,8 +245,6 @@ def forward( dropout=0.0 if not self.training else self.dropout.p, scaling=self.scaling, head_mask=head_mask, - # only for relevant for non-absolute positional embeddings - use_cache=past_key_value is not None, **kwargs, ) attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous() @@ -304,15 +253,11 @@ def forward( # Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->BertGeneration,BERT->BERT_GENERATION class BertGenerationAttention(nn.Module): - def __init__( - self, config, position_embedding_type=None, is_causal=False, layer_idx=None, is_cross_attention=False - ): + def __init__(self, config, is_causal=False, layer_idx=None, is_cross_attention=False): super().__init__() self.is_cross_attention = is_cross_attention attention_class = BertGenerationCrossAttention if is_cross_attention else BertGenerationSelfAttention - self.self = attention_class( - config, position_embedding_type=position_embedding_type, is_causal=is_causal, layer_idx=layer_idx - ) + self.self = attention_class(config, is_causal=is_causal, layer_idx=layer_idx) self.output = BertGenerationSelfOutput(config) self.pruned_heads = set() @@ -404,7 +349,6 @@ def __init__(self, config, layer_idx=None): raise ValueError(f"{self} should be used as a decoder model if cross attention is added") self.crossattention = BertGenerationAttention( config, - position_embedding_type="absolute", is_causal=False, layer_idx=layer_idx, is_cross_attention=True, diff --git a/src/transformers/models/big_bird/modeling_big_bird.py b/src/transformers/models/big_bird/modeling_big_bird.py index 6658235c2e03..7fbb262b0bef 100755 --- a/src/transformers/models/big_bird/modeling_big_bird.py +++ b/src/transformers/models/big_bird/modeling_big_bird.py @@ -78,7 +78,6 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) diff --git a/src/transformers/models/bridgetower/modeling_bridgetower.py b/src/transformers/models/bridgetower/modeling_bridgetower.py index ff88a0a087d1..1a8bb006cdb3 100644 --- a/src/transformers/models/bridgetower/modeling_bridgetower.py +++ b/src/transformers/models/bridgetower/modeling_bridgetower.py @@ -416,38 +416,13 @@ def eager_attention_forward( scaling: Optional[float] = None, dropout: float = 0.0, head_mask: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, **kwargs: Unpack[TransformersKwargs], ): if scaling is None: scaling = query.size(-1) ** -0.5 # Take the dot product between "query" and "key" to get the raw attention scores. 
- attn_weights = torch.matmul(query, key.transpose(2, 3)) - - # Relative positional embeddings - if module.position_embedding_type == "relative_key" or module.position_embedding_type == "relative_key_query": - query_length, key_length = query.shape[2], key.shape[2] - if use_cache: - position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=query.device).view(-1, 1) - else: - position_ids_l = torch.arange(query_length, dtype=torch.long, device=query.device).view(-1, 1) - position_ids_r = torch.arange(key_length, dtype=torch.long, device=query.device).view(1, -1) - distance = position_ids_l - position_ids_r - - positional_embedding = module.distance_embedding(distance + module.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query.dtype) # fp16 compatibility - - if module.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) - attn_weights = attn_weights + relative_position_scores - elif module.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key, positional_embedding) - attn_weights = attn_weights + relative_position_scores_query + relative_position_scores_key - - # Scaling is shifted in case of embeddings being relative - attn_weights = attn_weights * scaling + attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling if attention_mask is not None and attention_mask.ndim == 4: attention_mask = attention_mask[:, :, :, : key.shape[-2]] @@ -467,7 +442,7 @@ def eager_attention_forward( # Copied from transformers.models.roberta.modeling_roberta.RobertaSelfAttention with Roberta->BridgeTower class BridgeTowerSelfAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, is_causal=False, layer_idx=None): + def __init__(self, config, is_causal=False, layer_idx=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -486,12 +461,6 @@ def __init__(self, config, position_embedding_type=None, is_causal=False, layer_ self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_decoder = config.is_decoder self.is_causal = is_causal @@ -530,11 +499,6 @@ def forward( attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.position_embedding_type != "absolute": - raise ValueError( - f"You are using {self.config._attn_implementation} as attention type. However, non-absolute " - 'positional embeddings can not work with them. Please load the model with `attn_implementation="eager"`.' 
- ) attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( @@ -546,8 +510,6 @@ def forward( dropout=0.0 if not self.training else self.dropout.p, scaling=self.scaling, head_mask=head_mask, - # only for relevant for non-absolute positional embeddings - use_cache=past_key_value is not None, **kwargs, ) attn_output = attn_output.reshape(*input_shape, -1).contiguous() @@ -556,7 +518,7 @@ def forward( # Copied from transformers.models.roberta.modeling_roberta.RobertaCrossAttention with Roberta->BridgeTower class BridgeTowerCrossAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, is_causal=False, layer_idx=None): + def __init__(self, config, is_causal=False, layer_idx=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -575,12 +537,6 @@ def __init__(self, config, position_embedding_type=None, is_causal=False, layer_ self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_causal = is_causal self.layer_idx = layer_idx @@ -623,11 +579,6 @@ def forward( attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.position_embedding_type != "absolute": - raise ValueError( - f"You are using {self.config._attn_implementation} as attention type. However, non-absolute " - 'positional embeddings can not work with them. Please load the model with `attn_implementation="eager"`.' 
- ) attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( @@ -639,8 +590,6 @@ def forward( dropout=0.0 if not self.training else self.dropout.p, scaling=self.scaling, head_mask=head_mask, - # only for relevant for non-absolute positional embeddings - use_cache=past_key_value is not None, **kwargs, ) attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous() @@ -649,15 +598,11 @@ def forward( # Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->BridgeTower,BERT->BRIDGE_TOWER class BridgeTowerAttention(nn.Module): - def __init__( - self, config, position_embedding_type=None, is_causal=False, layer_idx=None, is_cross_attention=False - ): + def __init__(self, config, is_causal=False, layer_idx=None, is_cross_attention=False): super().__init__() self.is_cross_attention = is_cross_attention attention_class = BridgeTowerCrossAttention if is_cross_attention else BridgeTowerSelfAttention - self.self = attention_class( - config, position_embedding_type=position_embedding_type, is_causal=is_causal, layer_idx=layer_idx - ) + self.self = attention_class(config, is_causal=is_causal, layer_idx=layer_idx) self.output = BridgeTowerSelfOutput(config) self.pruned_heads = set() @@ -714,7 +659,6 @@ def __init__(self, config, layer_idx=None): self.add_cross_attention = config.add_cross_attention self.crossattention = BridgeTowerAttention( config, - position_embedding_type="absolute", is_causal=False, layer_idx=layer_idx, is_cross_attention=True, @@ -780,7 +724,6 @@ def __init__(self, config, layer_idx=None): raise ValueError(f"{self} should be used as a decoder model if cross attention is added") self.crossattention = BridgeTowerAttention( config, - position_embedding_type="absolute", is_causal=False, layer_idx=layer_idx, is_cross_attention=True, @@ -915,7 +858,6 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) @@ -967,11 +909,11 @@ def forward( if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings = embeddings + position_embeddings + embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings diff --git a/src/transformers/models/camembert/modeling_camembert.py b/src/transformers/models/camembert/modeling_camembert.py index dd882e60096f..3565e5faaafe 100644 --- a/src/transformers/models/camembert/modeling_camembert.py +++ b/src/transformers/models/camembert/modeling_camembert.py @@ -66,38 +66,13 @@ def eager_attention_forward( scaling: Optional[float] = None, dropout: float = 0.0, head_mask: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, **kwargs: Unpack[TransformersKwargs], ): if scaling is None: scaling = query.size(-1) ** -0.5 # Take the dot product between "query" and "key" to get the 
raw attention scores. - attn_weights = torch.matmul(query, key.transpose(2, 3)) - - # Relative positional embeddings - if module.position_embedding_type == "relative_key" or module.position_embedding_type == "relative_key_query": - query_length, key_length = query.shape[2], key.shape[2] - if use_cache: - position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=query.device).view(-1, 1) - else: - position_ids_l = torch.arange(query_length, dtype=torch.long, device=query.device).view(-1, 1) - position_ids_r = torch.arange(key_length, dtype=torch.long, device=query.device).view(1, -1) - distance = position_ids_l - position_ids_r - - positional_embedding = module.distance_embedding(distance + module.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query.dtype) # fp16 compatibility - - if module.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) - attn_weights = attn_weights + relative_position_scores - elif module.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key, positional_embedding) - attn_weights = attn_weights + relative_position_scores_query + relative_position_scores_key - - # Scaling is shifted in case of embeddings being relative - attn_weights = attn_weights * scaling + attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling if attention_mask is not None and attention_mask.ndim == 4: attention_mask = attention_mask[:, :, :, : key.shape[-2]] @@ -116,7 +91,7 @@ def eager_attention_forward( class CamembertSelfAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, is_causal=False, layer_idx=None): + def __init__(self, config, is_causal=False, layer_idx=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -135,12 +110,6 @@ def __init__(self, config, position_embedding_type=None, is_causal=False, layer_ self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_decoder = config.is_decoder self.is_causal = is_causal @@ -179,11 +148,6 @@ def forward( attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.position_embedding_type != "absolute": - raise ValueError( - f"You are using {self.config._attn_implementation} as attention type. However, non-absolute " - 'positional embeddings can not work with them. Please load the model with `attn_implementation="eager"`.' 
- ) attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( @@ -195,8 +159,6 @@ def forward( dropout=0.0 if not self.training else self.dropout.p, scaling=self.scaling, head_mask=head_mask, - # only for relevant for non-absolute positional embeddings - use_cache=past_key_value is not None, **kwargs, ) attn_output = attn_output.reshape(*input_shape, -1).contiguous() @@ -204,7 +166,7 @@ def forward( class CamembertCrossAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, is_causal=False, layer_idx=None): + def __init__(self, config, is_causal=False, layer_idx=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -223,12 +185,6 @@ def __init__(self, config, position_embedding_type=None, is_causal=False, layer_ self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_causal = is_causal self.layer_idx = layer_idx @@ -271,11 +227,6 @@ def forward( attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.position_embedding_type != "absolute": - raise ValueError( - f"You are using {self.config._attn_implementation} as attention type. However, non-absolute " - 'positional embeddings can not work with them. Please load the model with `attn_implementation="eager"`.' 
- ) attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( @@ -287,8 +238,6 @@ def forward( dropout=0.0 if not self.training else self.dropout.p, scaling=self.scaling, head_mask=head_mask, - # only for relevant for non-absolute positional embeddings - use_cache=past_key_value is not None, **kwargs, ) attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous() @@ -310,15 +259,11 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to class CamembertAttention(nn.Module): - def __init__( - self, config, position_embedding_type=None, is_causal=False, layer_idx=None, is_cross_attention=False - ): + def __init__(self, config, is_causal=False, layer_idx=None, is_cross_attention=False): super().__init__() self.is_cross_attention = is_cross_attention attention_class = CamembertCrossAttention if is_cross_attention else CamembertSelfAttention - self.self = attention_class( - config, position_embedding_type=position_embedding_type, is_causal=is_causal, layer_idx=layer_idx - ) + self.self = attention_class(config, is_causal=is_causal, layer_idx=layer_idx) self.output = CamembertSelfOutput(config) self.pruned_heads = set() @@ -407,7 +352,6 @@ def __init__(self, config, layer_idx=None): raise ValueError(f"{self} should be used as a decoder model if cross attention is added") self.crossattention = CamembertAttention( config, - position_embedding_type="absolute", is_causal=False, layer_idx=layer_idx, is_cross_attention=True, @@ -539,7 +483,6 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) @@ -591,11 +534,11 @@ def forward( if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings = embeddings + position_embeddings + embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings @@ -717,8 +660,6 @@ def __init__(self, config, add_pooling_layer=True): self.pooler = CamembertPooler(config) if add_pooling_layer else None - self.position_embedding_type = config.position_embedding_type - # Initialize weights and apply final processing self.post_init() diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py index 33ad9463ff24..4283fec6978d 100644 --- a/src/transformers/models/clap/modeling_clap.py +++ b/src/transformers/models/clap/modeling_clap.py @@ -990,7 +990,6 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "position_ids", 
torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=True ) @@ -1042,11 +1041,11 @@ def forward( if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings = embeddings + position_embeddings + embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings diff --git a/src/transformers/models/data2vec/modeling_data2vec_text.py b/src/transformers/models/data2vec/modeling_data2vec_text.py index 0cba1f894003..3cb501611998 100644 --- a/src/transformers/models/data2vec/modeling_data2vec_text.py +++ b/src/transformers/models/data2vec/modeling_data2vec_text.py @@ -67,7 +67,6 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) @@ -119,11 +118,11 @@ def forward( if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings = embeddings + position_embeddings + embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings @@ -172,38 +171,13 @@ def eager_attention_forward( scaling: Optional[float] = None, dropout: float = 0.0, head_mask: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, **kwargs: Unpack[TransformersKwargs], ): if scaling is None: scaling = query.size(-1) ** -0.5 # Take the dot product between "query" and "key" to get the raw attention scores. 
- attn_weights = torch.matmul(query, key.transpose(2, 3)) - - # Relative positional embeddings - if module.position_embedding_type == "relative_key" or module.position_embedding_type == "relative_key_query": - query_length, key_length = query.shape[2], key.shape[2] - if use_cache: - position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=query.device).view(-1, 1) - else: - position_ids_l = torch.arange(query_length, dtype=torch.long, device=query.device).view(-1, 1) - position_ids_r = torch.arange(key_length, dtype=torch.long, device=query.device).view(1, -1) - distance = position_ids_l - position_ids_r - - positional_embedding = module.distance_embedding(distance + module.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query.dtype) # fp16 compatibility - - if module.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) - attn_weights = attn_weights + relative_position_scores - elif module.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key, positional_embedding) - attn_weights = attn_weights + relative_position_scores_query + relative_position_scores_key - - # Scaling is shifted in case of embeddings being relative - attn_weights = attn_weights * scaling + attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling if attention_mask is not None and attention_mask.ndim == 4: attention_mask = attention_mask[:, :, :, : key.shape[-2]] @@ -222,7 +196,7 @@ def eager_attention_forward( class Data2VecTextSelfAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, is_causal=False, layer_idx=None): + def __init__(self, config, is_causal=False, layer_idx=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -241,12 +215,6 @@ def __init__(self, config, position_embedding_type=None, is_causal=False, layer_ self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_decoder = config.is_decoder self.is_causal = is_causal @@ -285,11 +253,6 @@ def forward( attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.position_embedding_type != "absolute": - raise ValueError( - f"You are using {self.config._attn_implementation} as attention type. However, non-absolute " - 'positional embeddings can not work with them. Please load the model with `attn_implementation="eager"`.' 
- ) attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( @@ -301,8 +264,6 @@ def forward( dropout=0.0 if not self.training else self.dropout.p, scaling=self.scaling, head_mask=head_mask, - # only for relevant for non-absolute positional embeddings - use_cache=past_key_value is not None, **kwargs, ) attn_output = attn_output.reshape(*input_shape, -1).contiguous() @@ -310,7 +271,7 @@ def forward( class Data2VecTextCrossAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, is_causal=False, layer_idx=None): + def __init__(self, config, is_causal=False, layer_idx=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -329,12 +290,6 @@ def __init__(self, config, position_embedding_type=None, is_causal=False, layer_ self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_causal = is_causal self.layer_idx = layer_idx @@ -377,11 +332,6 @@ def forward( attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.position_embedding_type != "absolute": - raise ValueError( - f"You are using {self.config._attn_implementation} as attention type. However, non-absolute " - 'positional embeddings can not work with them. Please load the model with `attn_implementation="eager"`.' 
- ) attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( @@ -393,8 +343,6 @@ def forward( dropout=0.0 if not self.training else self.dropout.p, scaling=self.scaling, head_mask=head_mask, - # only for relevant for non-absolute positional embeddings - use_cache=past_key_value is not None, **kwargs, ) attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous() @@ -416,15 +364,11 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to class Data2VecTextAttention(nn.Module): - def __init__( - self, config, position_embedding_type=None, is_causal=False, layer_idx=None, is_cross_attention=False - ): + def __init__(self, config, is_causal=False, layer_idx=None, is_cross_attention=False): super().__init__() self.is_cross_attention = is_cross_attention attention_class = Data2VecTextCrossAttention if is_cross_attention else Data2VecTextSelfAttention - self.self = attention_class( - config, position_embedding_type=position_embedding_type, is_causal=is_causal, layer_idx=layer_idx - ) + self.self = attention_class(config, is_causal=is_causal, layer_idx=layer_idx) self.output = Data2VecTextSelfOutput(config) self.pruned_heads = set() @@ -513,7 +457,6 @@ def __init__(self, config, layer_idx=None): raise ValueError(f"{self} should be used as a decoder model if cross attention is added") self.crossattention = Data2VecTextAttention( config, - position_embedding_type="absolute", is_causal=False, layer_idx=layer_idx, is_cross_attention=True, @@ -677,8 +620,6 @@ def __init__(self, config, add_pooling_layer=True): self.pooler = Data2VecTextPooler(config) if add_pooling_layer else None - self.position_embedding_type = config.position_embedding_type - # Initialize weights and apply final processing self.post_init() diff --git a/src/transformers/models/electra/modeling_electra.py b/src/transformers/models/electra/modeling_electra.py index 100e48034abb..e1d0591f7193 100644 --- a/src/transformers/models/electra/modeling_electra.py +++ b/src/transformers/models/electra/modeling_electra.py @@ -74,7 +74,6 @@ def __init__(self, config): self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False ) @@ -113,11 +112,11 @@ def forward( if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings = embeddings + position_embeddings + embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings @@ -133,38 +132,13 @@ def eager_attention_forward( scaling: Optional[float] = None, dropout: float = 0.0, head_mask: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, **kwargs: Unpack[TransformersKwargs], ): if scaling is None: scaling = query.size(-1) ** -0.5 # Take the dot product between "query" and "key" to get the raw attention scores. 
- attn_weights = torch.matmul(query, key.transpose(2, 3)) - - # Relative positional embeddings - if module.position_embedding_type == "relative_key" or module.position_embedding_type == "relative_key_query": - query_length, key_length = query.shape[2], key.shape[2] - if use_cache: - position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=query.device).view(-1, 1) - else: - position_ids_l = torch.arange(query_length, dtype=torch.long, device=query.device).view(-1, 1) - position_ids_r = torch.arange(key_length, dtype=torch.long, device=query.device).view(1, -1) - distance = position_ids_l - position_ids_r - - positional_embedding = module.distance_embedding(distance + module.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query.dtype) # fp16 compatibility - - if module.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) - attn_weights = attn_weights + relative_position_scores - elif module.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key, positional_embedding) - attn_weights = attn_weights + relative_position_scores_query + relative_position_scores_key - - # Scaling is shifted in case of embeddings being relative - attn_weights = attn_weights * scaling + attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling if attention_mask is not None and attention_mask.ndim == 4: attention_mask = attention_mask[:, :, :, : key.shape[-2]] @@ -184,7 +158,7 @@ def eager_attention_forward( # Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->Electra class ElectraSelfAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, is_causal=False, layer_idx=None): + def __init__(self, config, is_causal=False, layer_idx=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -203,12 +177,6 @@ def __init__(self, config, position_embedding_type=None, is_causal=False, layer_ self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_decoder = config.is_decoder self.is_causal = is_causal @@ -247,11 +215,6 @@ def forward( attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.position_embedding_type != "absolute": - raise ValueError( - f"You are using {self.config._attn_implementation} as attention type. However, non-absolute " - 'positional embeddings can not work with them. Please load the model with `attn_implementation="eager"`.' 
- ) attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( @@ -263,8 +226,6 @@ def forward( dropout=0.0 if not self.training else self.dropout.p, scaling=self.scaling, head_mask=head_mask, - # only for relevant for non-absolute positional embeddings - use_cache=past_key_value is not None, **kwargs, ) attn_output = attn_output.reshape(*input_shape, -1).contiguous() @@ -273,7 +234,7 @@ def forward( # Copied from transformers.models.bert.modeling_bert.BertCrossAttention with Bert->Electra class ElectraCrossAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, is_causal=False, layer_idx=None): + def __init__(self, config, is_causal=False, layer_idx=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -292,12 +253,6 @@ def __init__(self, config, position_embedding_type=None, is_causal=False, layer_ self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_causal = is_causal self.layer_idx = layer_idx @@ -340,11 +295,6 @@ def forward( attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.position_embedding_type != "absolute": - raise ValueError( - f"You are using {self.config._attn_implementation} as attention type. However, non-absolute " - 'positional embeddings can not work with them. Please load the model with `attn_implementation="eager"`.' 
- ) attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( @@ -356,8 +306,6 @@ def forward( dropout=0.0 if not self.training else self.dropout.p, scaling=self.scaling, head_mask=head_mask, - # only for relevant for non-absolute positional embeddings - use_cache=past_key_value is not None, **kwargs, ) attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous() @@ -381,15 +329,11 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to # Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Electra,BERT->ELECTRA class ElectraAttention(nn.Module): - def __init__( - self, config, position_embedding_type=None, is_causal=False, layer_idx=None, is_cross_attention=False - ): + def __init__(self, config, is_causal=False, layer_idx=None, is_cross_attention=False): super().__init__() self.is_cross_attention = is_cross_attention attention_class = ElectraCrossAttention if is_cross_attention else ElectraSelfAttention - self.self = attention_class( - config, position_embedding_type=position_embedding_type, is_causal=is_causal, layer_idx=layer_idx - ) + self.self = attention_class(config, is_causal=is_causal, layer_idx=layer_idx) self.output = ElectraSelfOutput(config) self.pruned_heads = set() @@ -481,7 +425,6 @@ def __init__(self, config, layer_idx=None): raise ValueError(f"{self} should be used as a decoder model if cross attention is added") self.crossattention = ElectraAttention( config, - position_embedding_type="absolute", is_causal=False, layer_idx=layer_idx, is_cross_attention=True, diff --git a/src/transformers/models/ernie/modeling_ernie.py b/src/transformers/models/ernie/modeling_ernie.py index 3e94cf71d1e6..56c9a731153a 100644 --- a/src/transformers/models/ernie/modeling_ernie.py +++ b/src/transformers/models/ernie/modeling_ernie.py @@ -71,7 +71,6 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) @@ -117,11 +116,10 @@ def forward( if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings = embeddings + position_embeddings # add `task_type_id` for ERNIE model if self.use_task_id: @@ -144,38 +142,13 @@ def eager_attention_forward( scaling: Optional[float] = None, dropout: float = 0.0, head_mask: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, **kwargs: Unpack[TransformersKwargs], ): if scaling is None: scaling = query.size(-1) ** -0.5 # Take the dot product between "query" and "key" to get the raw attention scores. 
- attn_weights = torch.matmul(query, key.transpose(2, 3)) - - # Relative positional embeddings - if module.position_embedding_type == "relative_key" or module.position_embedding_type == "relative_key_query": - query_length, key_length = query.shape[2], key.shape[2] - if use_cache: - position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=query.device).view(-1, 1) - else: - position_ids_l = torch.arange(query_length, dtype=torch.long, device=query.device).view(-1, 1) - position_ids_r = torch.arange(key_length, dtype=torch.long, device=query.device).view(1, -1) - distance = position_ids_l - position_ids_r - - positional_embedding = module.distance_embedding(distance + module.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query.dtype) # fp16 compatibility - - if module.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) - attn_weights = attn_weights + relative_position_scores - elif module.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key, positional_embedding) - attn_weights = attn_weights + relative_position_scores_query + relative_position_scores_key - - # Scaling is shifted in case of embeddings being relative - attn_weights = attn_weights * scaling + attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling if attention_mask is not None and attention_mask.ndim == 4: attention_mask = attention_mask[:, :, :, : key.shape[-2]] @@ -194,7 +167,7 @@ def eager_attention_forward( class ErnieSelfAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, is_causal=False, layer_idx=None): + def __init__(self, config, is_causal=False, layer_idx=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -213,12 +186,6 @@ def __init__(self, config, position_embedding_type=None, is_causal=False, layer_ self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_decoder = config.is_decoder self.is_causal = is_causal @@ -257,11 +224,6 @@ def forward( attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.position_embedding_type != "absolute": - raise ValueError( - f"You are using {self.config._attn_implementation} as attention type. However, non-absolute " - 'positional embeddings can not work with them. Please load the model with `attn_implementation="eager"`.' 
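The embedding side of the change follows the same pattern in every model here: position embeddings are looked up and added unconditionally instead of behind an `if self.position_embedding_type == "absolute":` guard. A self-contained sketch of the resulting forward (class name and constructor defaults are illustrative only):

```python
import torch
from torch import nn

class AbsoluteEmbeddingsSketch(nn.Module):
    def __init__(self, vocab_size=30522, type_vocab_size=2, max_position_embeddings=512,
                 hidden_size=768, layer_norm_eps=1e-12, dropout=0.1):
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
        self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size)
        self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
        self.LayerNorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, token_type_ids, position_ids):
        embeddings = self.word_embeddings(input_ids) + self.token_type_embeddings(token_type_ids)
        # Absolute position embeddings are always added now; there is no
        # position_embedding_type attribute left to check.
        embeddings = embeddings + self.position_embeddings(position_ids)
        embeddings = self.LayerNorm(embeddings)
        return self.dropout(embeddings)
```

XLM-RoBERTa-XL normalizes later in the block, so its embedding forward skips the LayerNorm step, but the unconditional position-embedding add is the same.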
- ) attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( @@ -273,8 +235,6 @@ def forward( dropout=0.0 if not self.training else self.dropout.p, scaling=self.scaling, head_mask=head_mask, - # only for relevant for non-absolute positional embeddings - use_cache=past_key_value is not None, **kwargs, ) attn_output = attn_output.reshape(*input_shape, -1).contiguous() @@ -282,7 +242,7 @@ def forward( class ErnieCrossAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, is_causal=False, layer_idx=None): + def __init__(self, config, is_causal=False, layer_idx=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -301,12 +261,6 @@ def __init__(self, config, position_embedding_type=None, is_causal=False, layer_ self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_causal = is_causal self.layer_idx = layer_idx @@ -349,11 +303,6 @@ def forward( attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.position_embedding_type != "absolute": - raise ValueError( - f"You are using {self.config._attn_implementation} as attention type. However, non-absolute " - 'positional embeddings can not work with them. Please load the model with `attn_implementation="eager"`.' 
- ) attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( @@ -365,8 +314,6 @@ def forward( dropout=0.0 if not self.training else self.dropout.p, scaling=self.scaling, head_mask=head_mask, - # only for relevant for non-absolute positional embeddings - use_cache=past_key_value is not None, **kwargs, ) attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous() @@ -388,15 +335,11 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to class ErnieAttention(nn.Module): - def __init__( - self, config, position_embedding_type=None, is_causal=False, layer_idx=None, is_cross_attention=False - ): + def __init__(self, config, is_causal=False, layer_idx=None, is_cross_attention=False): super().__init__() self.is_cross_attention = is_cross_attention attention_class = ErnieCrossAttention if is_cross_attention else ErnieSelfAttention - self.self = attention_class( - config, position_embedding_type=position_embedding_type, is_causal=is_causal, layer_idx=layer_idx - ) + self.self = attention_class(config, is_causal=is_causal, layer_idx=layer_idx) self.output = ErnieSelfOutput(config) self.pruned_heads = set() @@ -485,7 +428,6 @@ def __init__(self, config, layer_idx=None): raise ValueError(f"{self} should be used as a decoder model if cross attention is added") self.crossattention = ErnieAttention( config, - position_embedding_type="absolute", is_causal=False, layer_idx=layer_idx, is_cross_attention=True, @@ -699,8 +641,6 @@ def __init__(self, config, add_pooling_layer=True): self.pooler = ErniePooler(config) if add_pooling_layer else None - self.position_embedding_type = config.position_embedding_type - # Initialize weights and apply final processing self.post_init() diff --git a/src/transformers/models/ernie/modular_ernie.py b/src/transformers/models/ernie/modular_ernie.py index 30261966b3d0..cb48f1f258b5 100644 --- a/src/transformers/models/ernie/modular_ernie.py +++ b/src/transformers/models/ernie/modular_ernie.py @@ -111,11 +111,10 @@ def forward( if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings = embeddings + position_embeddings # add `task_type_id` for ERNIE model if self.use_task_id: diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py index 8810be00a0d0..17060b4c1595 100644 --- a/src/transformers/models/roberta/modeling_roberta.py +++ b/src/transformers/models/roberta/modeling_roberta.py @@ -68,7 +68,6 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) @@ -120,11 +119,11 @@ def forward( if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + 
token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings = embeddings + position_embeddings + embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings @@ -173,38 +172,13 @@ def eager_attention_forward( scaling: Optional[float] = None, dropout: float = 0.0, head_mask: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, **kwargs: Unpack[TransformersKwargs], ): if scaling is None: scaling = query.size(-1) ** -0.5 # Take the dot product between "query" and "key" to get the raw attention scores. - attn_weights = torch.matmul(query, key.transpose(2, 3)) - - # Relative positional embeddings - if module.position_embedding_type == "relative_key" or module.position_embedding_type == "relative_key_query": - query_length, key_length = query.shape[2], key.shape[2] - if use_cache: - position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=query.device).view(-1, 1) - else: - position_ids_l = torch.arange(query_length, dtype=torch.long, device=query.device).view(-1, 1) - position_ids_r = torch.arange(key_length, dtype=torch.long, device=query.device).view(1, -1) - distance = position_ids_l - position_ids_r - - positional_embedding = module.distance_embedding(distance + module.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query.dtype) # fp16 compatibility - - if module.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) - attn_weights = attn_weights + relative_position_scores - elif module.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key, positional_embedding) - attn_weights = attn_weights + relative_position_scores_query + relative_position_scores_key - - # Scaling is shifted in case of embeddings being relative - attn_weights = attn_weights * scaling + attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling if attention_mask is not None and attention_mask.ndim == 4: attention_mask = attention_mask[:, :, :, : key.shape[-2]] @@ -223,7 +197,7 @@ def eager_attention_forward( class RobertaSelfAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, is_causal=False, layer_idx=None): + def __init__(self, config, is_causal=False, layer_idx=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -242,12 +216,6 @@ def __init__(self, config, position_embedding_type=None, is_causal=False, layer_ self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_decoder = config.is_decoder self.is_causal = is_causal @@ -286,11 +254,6 @@ def forward( attention_interface: Callable = eager_attention_forward if 
self.config._attn_implementation != "eager": - if self.position_embedding_type != "absolute": - raise ValueError( - f"You are using {self.config._attn_implementation} as attention type. However, non-absolute " - 'positional embeddings can not work with them. Please load the model with `attn_implementation="eager"`.' - ) attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( @@ -302,8 +265,6 @@ def forward( dropout=0.0 if not self.training else self.dropout.p, scaling=self.scaling, head_mask=head_mask, - # only for relevant for non-absolute positional embeddings - use_cache=past_key_value is not None, **kwargs, ) attn_output = attn_output.reshape(*input_shape, -1).contiguous() @@ -311,7 +272,7 @@ def forward( class RobertaCrossAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, is_causal=False, layer_idx=None): + def __init__(self, config, is_causal=False, layer_idx=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -330,12 +291,6 @@ def __init__(self, config, position_embedding_type=None, is_causal=False, layer_ self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_causal = is_causal self.layer_idx = layer_idx @@ -378,11 +333,6 @@ def forward( attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.position_embedding_type != "absolute": - raise ValueError( - f"You are using {self.config._attn_implementation} as attention type. However, non-absolute " - 'positional embeddings can not work with them. Please load the model with `attn_implementation="eager"`.' 
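After this change the attention wrappers no longer accept `position_embedding_type` at all; cross-attention is selected purely through `is_cross_attention`. A hedged usage sketch, assuming a transformers build that already contains this patch (the tiny config values are arbitrary):

```python
from transformers import RobertaConfig
from transformers.models.roberta.modeling_roberta import RobertaAttention

config = RobertaConfig(hidden_size=64, num_attention_heads=4, is_decoder=True)

# Decoder-style self-attention: causality is requested via the flag,
# not via a positional-embedding argument.
self_attn = RobertaAttention(config, is_causal=True, layer_idx=0)

# Cross-attention picks RobertaCrossAttention internally.
cross_attn = RobertaAttention(config, is_causal=False, layer_idx=0, is_cross_attention=True)
```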
- ) attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( @@ -394,8 +344,6 @@ def forward( dropout=0.0 if not self.training else self.dropout.p, scaling=self.scaling, head_mask=head_mask, - # only for relevant for non-absolute positional embeddings - use_cache=past_key_value is not None, **kwargs, ) attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous() @@ -417,15 +365,11 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to class RobertaAttention(nn.Module): - def __init__( - self, config, position_embedding_type=None, is_causal=False, layer_idx=None, is_cross_attention=False - ): + def __init__(self, config, is_causal=False, layer_idx=None, is_cross_attention=False): super().__init__() self.is_cross_attention = is_cross_attention attention_class = RobertaCrossAttention if is_cross_attention else RobertaSelfAttention - self.self = attention_class( - config, position_embedding_type=position_embedding_type, is_causal=is_causal, layer_idx=layer_idx - ) + self.self = attention_class(config, is_causal=is_causal, layer_idx=layer_idx) self.output = RobertaSelfOutput(config) self.pruned_heads = set() @@ -514,7 +458,6 @@ def __init__(self, config, layer_idx=None): raise ValueError(f"{self} should be used as a decoder model if cross attention is added") self.crossattention = RobertaAttention( config, - position_embedding_type="absolute", is_causal=False, layer_idx=layer_idx, is_cross_attention=True, @@ -687,8 +630,6 @@ def __init__(self, config, add_pooling_layer=True): self.pooler = RobertaPooler(config) if add_pooling_layer else None - self.position_embedding_type = config.position_embedding_type - # Initialize weights and apply final processing self.post_init() diff --git a/src/transformers/models/roberta/modular_roberta.py b/src/transformers/models/roberta/modular_roberta.py index e98eddf99bf5..79d8368f2e65 100644 --- a/src/transformers/models/roberta/modular_roberta.py +++ b/src/transformers/models/roberta/modular_roberta.py @@ -93,11 +93,11 @@ def forward( if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings = embeddings + position_embeddings + embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings diff --git a/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py b/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py index 0085992d2a9a..169961fc77d5 100644 --- a/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +++ b/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py @@ -64,7 +64,6 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) @@ -116,11 +115,11 @@ def forward( if 
inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings = embeddings + position_embeddings + embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings @@ -170,38 +169,13 @@ def eager_attention_forward( scaling: Optional[float] = None, dropout: float = 0.0, head_mask: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, **kwargs: Unpack[TransformersKwargs], ): if scaling is None: scaling = query.size(-1) ** -0.5 # Take the dot product between "query" and "key" to get the raw attention scores. - attn_weights = torch.matmul(query, key.transpose(2, 3)) - - # Relative positional embeddings - if module.position_embedding_type == "relative_key" or module.position_embedding_type == "relative_key_query": - query_length, key_length = query.shape[2], key.shape[2] - if use_cache: - position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=query.device).view(-1, 1) - else: - position_ids_l = torch.arange(query_length, dtype=torch.long, device=query.device).view(-1, 1) - position_ids_r = torch.arange(key_length, dtype=torch.long, device=query.device).view(1, -1) - distance = position_ids_l - position_ids_r - - positional_embedding = module.distance_embedding(distance + module.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query.dtype) # fp16 compatibility - - if module.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) - attn_weights = attn_weights + relative_position_scores - elif module.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key, positional_embedding) - attn_weights = attn_weights + relative_position_scores_query + relative_position_scores_key - - # Scaling is shifted in case of embeddings being relative - attn_weights = attn_weights * scaling + attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling if attention_mask is not None and attention_mask.ndim == 4: attention_mask = attention_mask[:, :, :, : key.shape[-2]] @@ -221,7 +195,7 @@ def eager_attention_forward( # Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->RobertaPreLayerNorm class RobertaPreLayerNormSelfAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, is_causal=False, layer_idx=None): + def __init__(self, config, is_causal=False, layer_idx=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -240,12 +214,6 @@ def __init__(self, config, position_embedding_type=None, is_causal=False, layer_ self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = 
config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_decoder = config.is_decoder self.is_causal = is_causal @@ -284,11 +252,6 @@ def forward( attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.position_embedding_type != "absolute": - raise ValueError( - f"You are using {self.config._attn_implementation} as attention type. However, non-absolute " - 'positional embeddings can not work with them. Please load the model with `attn_implementation="eager"`.' - ) attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( @@ -300,8 +263,6 @@ def forward( dropout=0.0 if not self.training else self.dropout.p, scaling=self.scaling, head_mask=head_mask, - # only for relevant for non-absolute positional embeddings - use_cache=past_key_value is not None, **kwargs, ) attn_output = attn_output.reshape(*input_shape, -1).contiguous() @@ -310,7 +271,7 @@ def forward( # Copied from transformers.models.bert.modeling_bert.BertCrossAttention with Bert->RobertaPreLayerNorm class RobertaPreLayerNormCrossAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, is_causal=False, layer_idx=None): + def __init__(self, config, is_causal=False, layer_idx=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -329,12 +290,6 @@ def __init__(self, config, position_embedding_type=None, is_causal=False, layer_ self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_causal = is_causal self.layer_idx = layer_idx @@ -377,11 +332,6 @@ def forward( attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.position_embedding_type != "absolute": - raise ValueError( - f"You are using {self.config._attn_implementation} as attention type. However, non-absolute " - 'positional embeddings can not work with them. Please load the model with `attn_implementation="eager"`.' 
- ) attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( @@ -393,8 +343,6 @@ def forward( dropout=0.0 if not self.training else self.dropout.p, scaling=self.scaling, head_mask=head_mask, - # only for relevant for non-absolute positional embeddings - use_cache=past_key_value is not None, **kwargs, ) attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous() @@ -416,13 +364,13 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to class RobertaPreLayerNormAttention(nn.Module): def __init__( - self, config, position_embedding_type=None, is_causal=False, layer_idx=None, is_cross_attention=False + self, config, is_causal=False, layer_idx=None, is_cross_attention=False ): super().__init__() self.is_cross_attention = is_cross_attention attention_class = RobertaPreLayerNormCrossAttention if is_cross_attention else RobertaPreLayerNormSelfAttention self.self = attention_class( - config, position_embedding_type=position_embedding_type, is_causal=is_causal, layer_idx=layer_idx + config, is_causal=is_causal, layer_idx=layer_idx ) self.output = RobertaPreLayerNormSelfOutput(config) self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @@ -517,7 +465,6 @@ def __init__(self, config, layer_idx=None): raise ValueError(f"{self} should be used as a decoder model if cross attention is added") self.crossattention = RobertaPreLayerNormAttention( config, - position_embedding_type="absolute", is_causal=False, layer_idx=layer_idx, is_cross_attention=True, diff --git a/src/transformers/models/roc_bert/modeling_roc_bert.py b/src/transformers/models/roc_bert/modeling_roc_bert.py index d7614e59c2d6..6098e9246406 100644 --- a/src/transformers/models/roc_bert/modeling_roc_bert.py +++ b/src/transformers/models/roc_bert/modeling_roc_bert.py @@ -88,7 +88,6 @@ def __init__(self, config): self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device), @@ -132,9 +131,8 @@ def forward( inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + position_embeddings = self.position_embeddings(position_ids) + embeddings = embeddings + position_embeddings embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) @@ -172,9 +170,8 @@ def forward( token_type_embeddings = self.token_type_embeddings(token_type_ids) embedding_in += token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embedding_in += position_embeddings + position_embeddings = self.position_embeddings(position_ids) + embedding_in += position_embeddings embedding_in = self.LayerNorm(embedding_in) embedding_in = self.dropout(embedding_in) @@ -191,38 +188,13 @@ def eager_attention_forward( scaling: Optional[float] = None, dropout: float = 0.0, head_mask: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, **kwargs: Unpack[TransformersKwargs], ): if scaling is None: scaling = query.size(-1) ** -0.5 # Take the 
dot product between "query" and "key" to get the raw attention scores. - attn_weights = torch.matmul(query, key.transpose(2, 3)) - - # Relative positional embeddings - if module.position_embedding_type == "relative_key" or module.position_embedding_type == "relative_key_query": - query_length, key_length = query.shape[2], key.shape[2] - if use_cache: - position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=query.device).view(-1, 1) - else: - position_ids_l = torch.arange(query_length, dtype=torch.long, device=query.device).view(-1, 1) - position_ids_r = torch.arange(key_length, dtype=torch.long, device=query.device).view(1, -1) - distance = position_ids_l - position_ids_r - - positional_embedding = module.distance_embedding(distance + module.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query.dtype) # fp16 compatibility - - if module.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) - attn_weights = attn_weights + relative_position_scores - elif module.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key, positional_embedding) - attn_weights = attn_weights + relative_position_scores_query + relative_position_scores_key - - # Scaling is shifted in case of embeddings being relative - attn_weights = attn_weights * scaling + attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling if attention_mask is not None and attention_mask.ndim == 4: attention_mask = attention_mask[:, :, :, : key.shape[-2]] @@ -242,7 +214,7 @@ def eager_attention_forward( # Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->RoCBert class RoCBertSelfAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, is_causal=False, layer_idx=None): + def __init__(self, config, is_causal=False, layer_idx=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -261,12 +233,6 @@ def __init__(self, config, position_embedding_type=None, is_causal=False, layer_ self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_decoder = config.is_decoder self.is_causal = is_causal @@ -305,11 +271,6 @@ def forward( attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.position_embedding_type != "absolute": - raise ValueError( - f"You are using {self.config._attn_implementation} as attention type. However, non-absolute " - 'positional embeddings can not work with them. Please load the model with `attn_implementation="eager"`.' 
- ) attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( @@ -321,8 +282,6 @@ def forward( dropout=0.0 if not self.training else self.dropout.p, scaling=self.scaling, head_mask=head_mask, - # only for relevant for non-absolute positional embeddings - use_cache=past_key_value is not None, **kwargs, ) attn_output = attn_output.reshape(*input_shape, -1).contiguous() @@ -331,7 +290,7 @@ def forward( # Copied from transformers.models.bert.modeling_bert.BertCrossAttention with Bert->RoCBert class RoCBertCrossAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, is_causal=False, layer_idx=None): + def __init__(self, config, is_causal=False, layer_idx=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -350,12 +309,6 @@ def __init__(self, config, position_embedding_type=None, is_causal=False, layer_ self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_causal = is_causal self.layer_idx = layer_idx @@ -398,11 +351,6 @@ def forward( attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.position_embedding_type != "absolute": - raise ValueError( - f"You are using {self.config._attn_implementation} as attention type. However, non-absolute " - 'positional embeddings can not work with them. Please load the model with `attn_implementation="eager"`.' 
- ) attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( @@ -414,8 +362,6 @@ def forward( dropout=0.0 if not self.training else self.dropout.p, scaling=self.scaling, head_mask=head_mask, - # only for relevant for non-absolute positional embeddings - use_cache=past_key_value is not None, **kwargs, ) attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous() @@ -439,15 +385,11 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to # Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->RoCBert,BERT->ROC_BERT class RoCBertAttention(nn.Module): - def __init__( - self, config, position_embedding_type=None, is_causal=False, layer_idx=None, is_cross_attention=False - ): + def __init__(self, config, is_causal=False, layer_idx=None, is_cross_attention=False): super().__init__() self.is_cross_attention = is_cross_attention attention_class = RoCBertCrossAttention if is_cross_attention else RoCBertSelfAttention - self.self = attention_class( - config, position_embedding_type=position_embedding_type, is_causal=is_causal, layer_idx=layer_idx - ) + self.self = attention_class(config, is_causal=is_causal, layer_idx=layer_idx) self.output = RoCBertSelfOutput(config) self.pruned_heads = set() @@ -539,7 +481,6 @@ def __init__(self, config, layer_idx=None): raise ValueError(f"{self} should be used as a decoder model if cross attention is added") self.crossattention = RoCBertAttention( config, - position_embedding_type="absolute", is_causal=False, layer_idx=layer_idx, is_cross_attention=True, diff --git a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py index 60b14e35f781..b94e2a616904 100644 --- a/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/modeling_xlm_roberta.py @@ -66,38 +66,13 @@ def eager_attention_forward( scaling: Optional[float] = None, dropout: float = 0.0, head_mask: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, **kwargs: Unpack[TransformersKwargs], ): if scaling is None: scaling = query.size(-1) ** -0.5 # Take the dot product between "query" and "key" to get the raw attention scores. 
- attn_weights = torch.matmul(query, key.transpose(2, 3)) - - # Relative positional embeddings - if module.position_embedding_type == "relative_key" or module.position_embedding_type == "relative_key_query": - query_length, key_length = query.shape[2], key.shape[2] - if use_cache: - position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=query.device).view(-1, 1) - else: - position_ids_l = torch.arange(query_length, dtype=torch.long, device=query.device).view(-1, 1) - position_ids_r = torch.arange(key_length, dtype=torch.long, device=query.device).view(1, -1) - distance = position_ids_l - position_ids_r - - positional_embedding = module.distance_embedding(distance + module.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query.dtype) # fp16 compatibility - - if module.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) - attn_weights = attn_weights + relative_position_scores - elif module.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key, positional_embedding) - attn_weights = attn_weights + relative_position_scores_query + relative_position_scores_key - - # Scaling is shifted in case of embeddings being relative - attn_weights = attn_weights * scaling + attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling if attention_mask is not None and attention_mask.ndim == 4: attention_mask = attention_mask[:, :, :, : key.shape[-2]] @@ -116,7 +91,7 @@ def eager_attention_forward( class XLMRobertaSelfAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, is_causal=False, layer_idx=None): + def __init__(self, config, is_causal=False, layer_idx=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -135,12 +110,6 @@ def __init__(self, config, position_embedding_type=None, is_causal=False, layer_ self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_decoder = config.is_decoder self.is_causal = is_causal @@ -179,11 +148,6 @@ def forward( attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.position_embedding_type != "absolute": - raise ValueError( - f"You are using {self.config._attn_implementation} as attention type. However, non-absolute " - 'positional embeddings can not work with them. Please load the model with `attn_implementation="eager"`.' 
- ) attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( @@ -195,8 +159,6 @@ def forward( dropout=0.0 if not self.training else self.dropout.p, scaling=self.scaling, head_mask=head_mask, - # only for relevant for non-absolute positional embeddings - use_cache=past_key_value is not None, **kwargs, ) attn_output = attn_output.reshape(*input_shape, -1).contiguous() @@ -204,7 +166,7 @@ def forward( class XLMRobertaCrossAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, is_causal=False, layer_idx=None): + def __init__(self, config, is_causal=False, layer_idx=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -223,12 +185,6 @@ def __init__(self, config, position_embedding_type=None, is_causal=False, layer_ self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_causal = is_causal self.layer_idx = layer_idx @@ -271,11 +227,6 @@ def forward( attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.position_embedding_type != "absolute": - raise ValueError( - f"You are using {self.config._attn_implementation} as attention type. However, non-absolute " - 'positional embeddings can not work with them. Please load the model with `attn_implementation="eager"`.' 
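The deleted guard used to tell users to reload with `attn_implementation="eager"` whenever a checkpoint was configured with non-absolute position embeddings; with only the absolute path left, any attention backend can be selected directly. For reference, requesting a specific implementation at load time still looks like this (the checkpoint name is just an example):

```python
from transformers import AutoModel

model = AutoModel.from_pretrained("roberta-base", attn_implementation="eager")
```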
- ) attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( @@ -287,8 +238,6 @@ def forward( dropout=0.0 if not self.training else self.dropout.p, scaling=self.scaling, head_mask=head_mask, - # only for relevant for non-absolute positional embeddings - use_cache=past_key_value is not None, **kwargs, ) attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous() @@ -310,15 +259,11 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to class XLMRobertaAttention(nn.Module): - def __init__( - self, config, position_embedding_type=None, is_causal=False, layer_idx=None, is_cross_attention=False - ): + def __init__(self, config, is_causal=False, layer_idx=None, is_cross_attention=False): super().__init__() self.is_cross_attention = is_cross_attention attention_class = XLMRobertaCrossAttention if is_cross_attention else XLMRobertaSelfAttention - self.self = attention_class( - config, position_embedding_type=position_embedding_type, is_causal=is_causal, layer_idx=layer_idx - ) + self.self = attention_class(config, is_causal=is_causal, layer_idx=layer_idx) self.output = XLMRobertaSelfOutput(config) self.pruned_heads = set() @@ -407,7 +352,6 @@ def __init__(self, config, layer_idx=None): raise ValueError(f"{self} should be used as a decoder model if cross attention is added") self.crossattention = XLMRobertaAttention( config, - position_embedding_type="absolute", is_causal=False, layer_idx=layer_idx, is_cross_attention=True, @@ -539,7 +483,6 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) @@ -591,11 +534,11 @@ def forward( if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings = embeddings + position_embeddings + embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings @@ -706,8 +649,6 @@ def __init__(self, config, add_pooling_layer=True): self.pooler = XLMRobertaPooler(config) if add_pooling_layer else None - self.position_embedding_type = config.position_embedding_type - # Initialize weights and apply final processing self.post_init() diff --git a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py index 522d63aad884..9b431a144e69 100644 --- a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +++ b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py @@ -71,7 +71,6 @@ def __init__(self, config): self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.position_embedding_type = getattr(config, 
"position_embedding_type", "absolute") self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) @@ -123,11 +122,10 @@ def forward( if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings = embeddings + position_embeddings embeddings = self.dropout(embeddings) return embeddings @@ -176,38 +174,13 @@ def eager_attention_forward( scaling: Optional[float] = None, dropout: float = 0.0, head_mask: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, **kwargs: Unpack[TransformersKwargs], ): if scaling is None: scaling = query.size(-1) ** -0.5 # Take the dot product between "query" and "key" to get the raw attention scores. - attn_weights = torch.matmul(query, key.transpose(2, 3)) - - # Relative positional embeddings - if module.position_embedding_type == "relative_key" or module.position_embedding_type == "relative_key_query": - query_length, key_length = query.shape[2], key.shape[2] - if use_cache: - position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=query.device).view(-1, 1) - else: - position_ids_l = torch.arange(query_length, dtype=torch.long, device=query.device).view(-1, 1) - position_ids_r = torch.arange(key_length, dtype=torch.long, device=query.device).view(1, -1) - distance = position_ids_l - position_ids_r - - positional_embedding = module.distance_embedding(distance + module.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query.dtype) # fp16 compatibility - - if module.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) - attn_weights = attn_weights + relative_position_scores - elif module.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key, positional_embedding) - attn_weights = attn_weights + relative_position_scores_query + relative_position_scores_key - - # Scaling is shifted in case of embeddings being relative - attn_weights = attn_weights * scaling + attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling if attention_mask is not None and attention_mask.ndim == 4: attention_mask = attention_mask[:, :, :, : key.shape[-2]] @@ -226,7 +199,7 @@ def eager_attention_forward( class XLMRobertaXLSelfAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, is_causal=False, layer_idx=None): + def __init__(self, config, is_causal=False, layer_idx=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -245,12 +218,6 @@ def __init__(self, config, position_embedding_type=None, is_causal=False, layer_ self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == 
"relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_decoder = config.is_decoder self.is_causal = is_causal @@ -289,11 +256,6 @@ def forward( attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.position_embedding_type != "absolute": - raise ValueError( - f"You are using {self.config._attn_implementation} as attention type. However, non-absolute " - 'positional embeddings can not work with them. Please load the model with `attn_implementation="eager"`.' - ) attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( @@ -305,8 +267,6 @@ def forward( dropout=0.0 if not self.training else self.dropout.p, scaling=self.scaling, head_mask=head_mask, - # only for relevant for non-absolute positional embeddings - use_cache=past_key_value is not None, **kwargs, ) attn_output = attn_output.reshape(*input_shape, -1).contiguous() @@ -314,7 +274,7 @@ def forward( class XLMRobertaXLCrossAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, is_causal=False, layer_idx=None): + def __init__(self, config, is_causal=False, layer_idx=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -333,12 +293,6 @@ def __init__(self, config, position_embedding_type=None, is_causal=False, layer_ self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_causal = is_causal self.layer_idx = layer_idx @@ -381,11 +335,6 @@ def forward( attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.position_embedding_type != "absolute": - raise ValueError( - f"You are using {self.config._attn_implementation} as attention type. However, non-absolute " - 'positional embeddings can not work with them. Please load the model with `attn_implementation="eager"`.' 
- ) attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( @@ -397,8 +346,6 @@ def forward( dropout=0.0 if not self.training else self.dropout.p, scaling=self.scaling, head_mask=head_mask, - # only for relevant for non-absolute positional embeddings - use_cache=past_key_value is not None, **kwargs, ) attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous() @@ -419,15 +366,11 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to class XLMRobertaXLAttention(nn.Module): - def __init__( - self, config, position_embedding_type=None, is_causal=False, layer_idx=None, is_cross_attention=False - ): + def __init__(self, config, is_causal=False, layer_idx=None, is_cross_attention=False): super().__init__() self.is_cross_attention = is_cross_attention attention_class = XLMRobertaXLCrossAttention if is_cross_attention else XLMRobertaXLSelfAttention - self.self = attention_class( - config, position_embedding_type=position_embedding_type, is_causal=is_causal, layer_idx=layer_idx - ) + self.self = attention_class(config, is_causal=is_causal, layer_idx=layer_idx) self.output = XLMRobertaXLSelfOutput(config) self.pruned_heads = set() @@ -516,7 +459,6 @@ def __init__(self, config, layer_idx=None): raise ValueError(f"{self} should be used as a decoder model if cross attention is added") self.crossattention = XLMRobertaXLAttention( config, - position_embedding_type="absolute", is_causal=False, layer_idx=layer_idx, is_cross_attention=True, @@ -694,8 +636,6 @@ def __init__(self, config, add_pooling_layer=True): self.pooler = XLMRobertaXLPooler(config) if add_pooling_layer else None - self.position_embedding_type = config.position_embedding_type - # Initialize weights and apply final processing self.post_init() diff --git a/src/transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py index d4937d424d31..1462625d83a2 100644 --- a/src/transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py +++ b/src/transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py @@ -102,11 +102,10 @@ def forward( if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings = embeddings + position_embeddings embeddings = self.dropout(embeddings) return embeddings @@ -135,9 +134,9 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to class XLMRobertaXLAttention(BertAttention): def __init__( - self, config, position_embedding_type=None, is_causal=False, layer_idx=None, is_cross_attention=False + self, config, is_causal=False, layer_idx=None, is_cross_attention=False ): - super().__init__(config, position_embedding_type, is_causal, layer_idx, is_cross_attention) + super().__init__(config, is_causal, layer_idx, is_cross_attention) del self.LayerNorm self.self_attn_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) diff --git a/src/transformers/models/xmod/modeling_xmod.py b/src/transformers/models/xmod/modeling_xmod.py index eaf1362d3664..7c7f8ffae58d 100644 --- a/src/transformers/models/xmod/modeling_xmod.py +++ 
b/src/transformers/models/xmod/modeling_xmod.py @@ -63,7 +63,6 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) @@ -115,11 +114,11 @@ def forward( if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings = embeddings + position_embeddings + embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings @@ -169,38 +168,13 @@ def eager_attention_forward( scaling: Optional[float] = None, dropout: float = 0.0, head_mask: Optional[torch.Tensor] = None, - use_cache: Optional[bool] = None, **kwargs: Unpack[TransformersKwargs], ): if scaling is None: scaling = query.size(-1) ** -0.5 # Take the dot product between "query" and "key" to get the raw attention scores. - attn_weights = torch.matmul(query, key.transpose(2, 3)) - - # Relative positional embeddings - if module.position_embedding_type == "relative_key" or module.position_embedding_type == "relative_key_query": - query_length, key_length = query.shape[2], key.shape[2] - if use_cache: - position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=query.device).view(-1, 1) - else: - position_ids_l = torch.arange(query_length, dtype=torch.long, device=query.device).view(-1, 1) - position_ids_r = torch.arange(key_length, dtype=torch.long, device=query.device).view(1, -1) - distance = position_ids_l - position_ids_r - - positional_embedding = module.distance_embedding(distance + module.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query.dtype) # fp16 compatibility - - if module.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) - attn_weights = attn_weights + relative_position_scores - elif module.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key, positional_embedding) - attn_weights = attn_weights + relative_position_scores_query + relative_position_scores_key - - # Scaling is shifted in case of embeddings being relative - attn_weights = attn_weights * scaling + attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling if attention_mask is not None and attention_mask.ndim == 4: attention_mask = attention_mask[:, :, :, : key.shape[-2]] @@ -220,7 +194,7 @@ def eager_attention_forward( # Copied from transformers.models.roberta.modeling_roberta.RobertaSelfAttention with Roberta->Xmod class XmodSelfAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, is_causal=False, layer_idx=None): + def __init__(self, config, is_causal=False, layer_idx=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not 
hasattr(config, "embedding_size"): raise ValueError( @@ -239,12 +213,6 @@ def __init__(self, config, position_embedding_type=None, is_causal=False, layer_ self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_decoder = config.is_decoder self.is_causal = is_causal @@ -283,11 +251,6 @@ def forward( attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.position_embedding_type != "absolute": - raise ValueError( - f"You are using {self.config._attn_implementation} as attention type. However, non-absolute " - 'positional embeddings can not work with them. Please load the model with `attn_implementation="eager"`.' - ) attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( @@ -299,8 +262,6 @@ def forward( dropout=0.0 if not self.training else self.dropout.p, scaling=self.scaling, head_mask=head_mask, - # only for relevant for non-absolute positional embeddings - use_cache=past_key_value is not None, **kwargs, ) attn_output = attn_output.reshape(*input_shape, -1).contiguous() @@ -309,7 +270,7 @@ def forward( # Copied from transformers.models.bert.modeling_bert.BertCrossAttention with Bert->Xmod class XmodCrossAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, is_causal=False, layer_idx=None): + def __init__(self, config, is_causal=False, layer_idx=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -328,12 +289,6 @@ def __init__(self, config, position_embedding_type=None, is_causal=False, layer_ self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_causal = is_causal self.layer_idx = layer_idx @@ -376,11 +331,6 @@ def forward( attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.position_embedding_type != "absolute": - raise ValueError( - f"You are using {self.config._attn_implementation} as attention type. However, non-absolute " - 'positional embeddings can not work with them. Please load the model with `attn_implementation="eager"`.' 
- ) attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( @@ -392,8 +342,6 @@ def forward( dropout=0.0 if not self.training else self.dropout.p, scaling=self.scaling, head_mask=head_mask, - # only for relevant for non-absolute positional embeddings - use_cache=past_key_value is not None, **kwargs, ) attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous() @@ -417,13 +365,13 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to class XmodAttention(nn.Module): def __init__( - self, config, position_embedding_type=None, is_causal=False, layer_idx=None, is_cross_attention=False + self, config, is_causal=False, layer_idx=None, is_cross_attention=False ): super().__init__() self.is_cross_attention = is_cross_attention attention_class = XmodCrossAttention if is_cross_attention else XmodSelfAttention self.self = attention_class( - config, position_embedding_type=position_embedding_type, is_causal=is_causal, layer_idx=layer_idx + config, is_causal=is_causal, layer_idx=layer_idx ) self.output = XmodSelfOutput(config) self.pruned_heads = set() @@ -578,7 +526,6 @@ def __init__(self, config, layer_idx=None): raise ValueError(f"{self} should be used as a decoder model if cross attention is added") self.crossattention = XmodAttention( config, - position_embedding_type="absolute", is_causal=False, layer_idx=layer_idx, is_cross_attention=True, diff --git a/tests/models/albert/test_modeling_albert.py b/tests/models/albert/test_modeling_albert.py index 193143d7b46a..6e0d5ef5603c 100644 --- a/tests/models/albert/test_modeling_albert.py +++ b/tests/models/albert/test_modeling_albert.py @@ -307,13 +307,6 @@ def test_for_sequence_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - config_and_inputs[0]._attn_implementation = "eager" - self.model_tester.create_and_check_model(*config_and_inputs) - @slow def test_model_from_pretrained(self): model_name = "albert/albert-base-v1" diff --git a/tests/models/bert/test_modeling_bert.py b/tests/models/bert/test_modeling_bert.py index 19094754f8bb..1106231ba5ae 100644 --- a/tests/models/bert/test_modeling_bert.py +++ b/tests/models/bert/test_modeling_bert.py @@ -496,13 +496,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - config_and_inputs[0]._attn_implementation = "eager" - self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_3d_mask_shapes(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() # manipulate input_mask @@ -588,12 +581,6 @@ def test_decoder_model_past_with_large_inputs(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - def test_decoder_model_past_with_large_inputs_relative_pos_emb(self): - config_and_inputs 
= self.model_tester.prepare_config_and_inputs_for_decoder() - config_and_inputs[0].position_embedding_type = "relative_key" - config_and_inputs[0]._attn_implementation = "eager" - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - def test_for_multiple_choice(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) @@ -754,40 +741,6 @@ def test_inference_no_head_absolute_embedding(self): torch.testing.assert_close(output[:, 1:4, 1:4], expected_slice, rtol=1e-4, atol=1e-4) - @slow - def test_inference_no_head_relative_embedding_key(self): - model = BertModel.from_pretrained( - "zhiheng-huang/bert-base-uncased-embedding-relative-key", attn_implementation="eager" - ) - input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) - attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) - with torch.no_grad(): - output = model(input_ids, attention_mask=attention_mask)[0] - expected_shape = torch.Size((1, 11, 768)) - self.assertEqual(output.shape, expected_shape) - expected_slice = torch.tensor( - [[[0.0756, 0.3142, -0.5128], [0.3761, 0.3462, -0.5477], [0.2052, 0.3760, -0.1240]]] - ) - - torch.testing.assert_close(output[:, 1:4, 1:4], expected_slice, rtol=1e-4, atol=1e-4) - - @slow - def test_inference_no_head_relative_embedding_key_query(self): - model = BertModel.from_pretrained( - "zhiheng-huang/bert-base-uncased-embedding-relative-key-query", attn_implementation="eager" - ) - input_ids = torch.tensor([[0, 345, 232, 328, 740, 140, 1695, 69, 6078, 1588, 2]]) - attention_mask = torch.tensor([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) - with torch.no_grad(): - output = model(input_ids, attention_mask=attention_mask)[0] - expected_shape = torch.Size((1, 11, 768)) - self.assertEqual(output.shape, expected_shape) - expected_slice = torch.tensor( - [[[0.6496, 0.3784, 0.8203], [0.8148, 0.5656, 0.2636], [-0.0681, 0.5597, 0.7045]]] - ) - - torch.testing.assert_close(output[:, 1:4, 1:4], expected_slice, rtol=1e-4, atol=1e-4) - @slow @pytest.mark.torch_export_test def test_export(self): diff --git a/tests/models/roberta/test_modeling_roberta.py b/tests/models/roberta/test_modeling_roberta.py index 009e9dfc22c1..a26468640524 100644 --- a/tests/models/roberta/test_modeling_roberta.py +++ b/tests/models/roberta/test_modeling_roberta.py @@ -413,13 +413,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - config_and_inputs[0]._attn_implementation = "eager" - self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_as_decoder(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) @@ -459,12 +452,6 @@ def test_decoder_model_past_with_large_inputs(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - def test_decoder_model_past_with_large_inputs_relative_pos_emb(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - config_and_inputs[0].position_embedding_type 
= "relative_key" - config_and_inputs[0]._attn_implementation = "eager" - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - def test_for_masked_lm(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) From e0569a53bf4800ec32253d0021d9c2b9d975c9ee Mon Sep 17 00:00:00 2001 From: vasqu Date: Thu, 25 Sep 2025 18:57:52 +0200 Subject: [PATCH 2/5] remaining changes --- .../modeling_dummy_bert.py | 745 ++++++++--------- .../modular-transformers/modeling_roberta.py | 750 ++++++++---------- .../modular_dummy_bert.py | 5 +- .../models/albert/configuration_albert.py | 8 - .../models/align/configuration_align.py | 8 - .../models/align/modeling_align.py | 9 +- .../models/altclip/configuration_altclip.py | 8 - .../models/bert/configuration_bert.py | 8 - .../configuration_bert_generation.py | 8 - .../models/blip/modeling_blip_text.py | 27 +- .../models/blip_2/configuration_blip_2.py | 8 - .../models/blip_2/modeling_blip_2.py | 27 +- .../bridgetower/configuration_bridgetower.py | 8 - src/transformers/models/bros/modeling_bros.py | 30 +- .../camembert/configuration_camembert.py | 8 - .../models/canine/modeling_canine.py | 28 +- .../configuration_chinese_clip.py | 8 - .../chinese_clip/modeling_chinese_clip.py | 9 +- .../models/clap/configuration_clap.py | 8 - .../data2vec/configuration_data2vec_text.py | 8 - .../deprecated/ernie_m/modeling_ernie_m.py | 34 +- .../deprecated/qdqbert/modeling_qdqbert.py | 29 +- .../models/deprecated/realm/modeling_realm.py | 47 +- .../models/dpr/configuration_dpr.py | 8 - .../models/electra/configuration_electra.py | 8 - .../models/ernie/configuration_ernie.py | 8 - .../models/esm/configuration_esm.py | 6 +- src/transformers/models/esm/modeling_esm.py | 51 +- .../models/evolla/modeling_evolla.py | 50 +- .../models/evolla/modular_evolla.py | 8 +- .../models/flava/configuration_flava.py | 8 - .../models/flava/modeling_flava.py | 9 +- .../models/git/configuration_git.py | 8 - src/transformers/models/git/modeling_git.py | 41 +- .../models/ibert/configuration_ibert.py | 8 - .../models/ibert/modeling_ibert.py | 19 +- .../configuration_instructblip.py | 8 - .../instructblip/modeling_instructblip.py | 27 +- .../configuration_instructblipvideo.py | 8 - .../modeling_instructblipvideo.py | 27 +- .../models/lilt/configuration_lilt.py | 8 - src/transformers/models/lilt/modeling_lilt.py | 37 +- .../configuration_megatron_bert.py | 8 - .../megatron_bert/modeling_megatron_bert.py | 38 +- .../models/mra/configuration_mra.py | 4 - src/transformers/models/mra/modeling_mra.py | 18 +- .../nystromformer/modeling_nystromformer.py | 18 +- .../models/roberta/configuration_roberta.py | 8 - .../configuration_roberta_prelayernorm.py | 8 - .../models/roc_bert/configuration_roc_bert.py | 8 - .../models/splinter/modeling_splinter.py | 9 +- .../models/superglue/modeling_superglue.py | 32 +- src/transformers/models/vilt/modeling_vilt.py | 9 +- .../xlm_roberta/configuration_xlm_roberta.py | 8 - .../configuration_xlm_roberta_xl.py | 8 - .../models/xmod/configuration_xmod.py | 8 - .../models/yoso/configuration_yoso.py | 4 - src/transformers/models/yoso/modeling_yoso.py | 18 +- .../models/big_bird/test_modeling_big_bird.py | 3 - tests/models/biogpt/test_modeling_biogpt.py | 6 - tests/models/bitnet/test_modeling_bitnet.py | 6 - tests/models/bros/test_modeling_bros.py | 6 - .../test_modeling_chinese_clip.py | 6 - tests/models/cohere/test_modeling_cohere.py | 6 - 
.../data2vec/test_modeling_data2vec_text.py | 13 - tests/models/dbrx/test_modeling_dbrx.py | 6 - .../deepseek_v3/test_modeling_deepseek_v3.py | 6 - .../diffllama/test_modeling_diffllama.py | 6 - tests/models/electra/test_modeling_electra.py | 7 - .../test_modeling_encoder_decoder.py | 21 - tests/models/ernie/test_modeling_ernie.py | 13 - tests/models/esm/test_modeling_esm.py | 7 - tests/models/flava/test_modeling_flava.py | 3 - tests/models/git/test_modeling_git.py | 6 - tests/models/granite/test_modeling_granite.py | 6 - .../test_modeling_granite_speech.py | 1 - .../granitemoe/test_modeling_granitemoe.py | 6 - .../test_modeling_granitemoeshared.py | 6 - tests/models/ibert/test_modeling_ibert.py | 7 - .../models/layoutlm/test_modeling_layoutlm.py | 6 - .../layoutlmv2/test_modeling_layoutlmv2.py | 6 - .../layoutlmv3/test_modeling_layoutlmv3.py | 6 - tests/models/lilt/test_modeling_lilt.py | 6 - .../modernbert/test_modeling_modernbert.py | 6 - tests/models/mra/test_modeling_mra.py | 6 - .../test_modeling_nystromformer.py | 6 - tests/models/olmo/test_modeling_olmo.py | 6 - tests/models/olmo2/test_modeling_olmo2.py | 6 - tests/models/olmoe/test_modeling_olmoe.py | 6 - tests/models/rembert/test_modeling_rembert.py | 6 - .../test_modeling_roberta_prelayernorm.py | 8 - .../models/roc_bert/test_modeling_roc_bert.py | 13 - .../models/splinter/test_modeling_splinter.py | 6 - .../visual_bert/test_modeling_visual_bert.py | 6 - .../test_modeling_xlm_roberta_xl.py | 13 - tests/models/xmod/test_modeling_xmod.py | 13 - tests/models/yoso/test_modeling_yoso.py | 6 - 97 files changed, 838 insertions(+), 1821 deletions(-) diff --git a/examples/modular-transformers/modeling_dummy_bert.py b/examples/modular-transformers/modeling_dummy_bert.py index 9df092f73e6e..d5cc87230e80 100644 --- a/examples/modular-transformers/modeling_dummy_bert.py +++ b/examples/modular-transformers/modeling_dummy_bert.py @@ -4,24 +4,29 @@ # the file from the modular. If any change should be done, please apply the change to the # modular_dummy_bert.py file directly. One of our CI enforces this. 
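# --- Illustrative aside (not part of the patch): with the relative_key /
# relative_key_query branches gone, the eager path in this file reduces to plain
# scaled dot-product attention. Below is a minimal, self-contained sketch of
# that computation under made-up tensor shapes; only torch is assumed, and
# `simple_eager_attention` is a hypothetical name, not a library function.
import torch
from torch import nn


def simple_eager_attention(query, key, value, attention_mask=None, scaling=None, dropout=0.0, training=False):
    # query/key/value: [batch, heads, seq_len, head_dim]; attention_mask (if any): additive, 4D
    if scaling is None:
        scaling = query.size(-1) ** -0.5
    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling  # scores scaled up front
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask[:, :, :, : key.shape[-2]]
    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=training)
    attn_output = torch.matmul(attn_weights, value).transpose(1, 2).contiguous()
    return attn_output, attn_weights


# toy usage: batch=2, heads=4, seq=5, head_dim=8
q = k = v = torch.randn(2, 4, 5, 8)
out, weights = simple_eager_attention(q, k, v)
assert out.shape == (2, 5, 4, 8) and weights.shape == (2, 4, 5, 5)
# --- end of aside -----------------------------------------------------------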
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -import math -from typing import Optional, Union +from typing import Callable, Optional, Union import torch from torch import nn from ...activations import ACT2FN -from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache -from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa, _prepare_4d_causal_attention_mask_for_sdpa +from ...cache_utils import Cache, EncoderDecoderCache +from ...masking_utils import create_causal_mask +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_attention_mask_for_sdpa from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, BaseModelOutputWithPoolingAndCrossAttentions -from ...modeling_utils import PreTrainedModel +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer -from ...utils import auto_docstring, logging -from ...utils.deprecation import deprecate_kwarg +from ...utils import TransformersKwargs, auto_docstring, is_torch_flex_attn_available, logging +from ...utils.generic import check_model_inputs from .configuration_dummy_bert import DummyBertConfig +if is_torch_flex_attn_available(): + from ...integrations.flex_attention import make_flex_block_causal_mask + + logger = logging.get_logger(__name__) @@ -37,7 +42,6 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) @@ -58,7 +62,7 @@ def forward( else: input_shape = inputs_embeds.size()[:-1] - seq_length = input_shape[1] + batch_size, seq_length = input_shape if position_ids is None: position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] @@ -68,260 +72,211 @@ def forward( # issue #5664 if token_type_ids is None: if hasattr(self, "token_type_ids"): - buffered_token_type_ids = self.token_type_ids[:, :seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) - token_type_ids = buffered_token_type_ids_expanded + # NOTE: We assume either pos ids to have bsz == 1 (broadcastable) or bsz == effective bsz (input_shape[0]) + buffered_token_type_ids = self.token_type_ids.expand(position_ids.shape[0], -1) + buffered_token_type_ids = torch.gather(buffered_token_type_ids, dim=1, index=position_ids) + token_type_ids = buffered_token_type_ids.expand(batch_size, seq_length) else: token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings = embeddings + position_embeddings + embeddings = self.LayerNorm(embeddings) embeddings = 
self.dropout(embeddings) return embeddings +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: Optional[float] = None, + dropout: float = 0.0, + head_mask: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], +): + if scaling is None: + scaling = query.size(-1) ** -0.5 + + # Take the dot product between "query" and "key" to get the raw attention scores. + attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling + + if attention_mask is not None and attention_mask.ndim == 4: + attention_mask = attention_mask[:, :, :, : key.shape[-2]] + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + + if head_mask is not None: + attn_weights = attn_weights * head_mask + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + class DummyBertSelfAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, layer_idx=None): + def __init__(self, config, is_causal=False, layer_idx=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " f"heads ({config.num_attention_heads})" ) + self.config = config self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size + self.scaling = self.attention_head_size**-0.5 self.query = nn.Linear(config.hidden_size, self.all_head_size) self.key = nn.Linear(config.hidden_size, self.all_head_size) self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_decoder = config.is_decoder + self.is_causal = is_causal self.layer_idx = layer_idx - @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - past_key_values: Optional[Cache] = None, - output_attentions: Optional[bool] = False, + past_key_value: Optional[Cache] = None, cache_position: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor]: - batch_size, seq_length, _ = hidden_states.shape - query_layer = self.query(hidden_states) - query_layer = query_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose( - 1, 2 - ) - - is_updated = False - is_cross_attention = encoder_hidden_states is not None - if past_key_values is not None: - if isinstance(past_key_values, EncoderDecoderCache): - is_updated = past_key_values.is_updated.get(self.layer_idx) - 
if is_cross_attention: - # after the first generated id, we can subsequently re-use all key/value_layer from cache - curr_past_key_value = past_key_values.cross_attention_cache - else: - curr_past_key_value = past_key_values.self_attention_cache - else: - curr_past_key_value = past_key_values - - current_states = encoder_hidden_states if is_cross_attention else hidden_states - if is_cross_attention and past_key_values is not None and is_updated: - # reuse k,v, cross_attentions - key_layer = curr_past_key_value.layers[self.layer_idx].keys - value_layer = curr_past_key_value.layers[self.layer_idx].values - else: - key_layer = self.key(current_states) - key_layer = key_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose( - 1, 2 + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.attention_head_size) + + # get all proj + query_layer = self.query(hidden_states).view(*hidden_shape).transpose(1, 2) + key_layer = self.key(hidden_states).view(*hidden_shape).transpose(1, 2) + value_layer = self.value(hidden_states).view(*hidden_shape).transpose(1, 2) + + if past_key_value is not None: + # decoder-only dummy_bert can have a simple dynamic cache for example + current_past_key_value = past_key_value + if isinstance(past_key_value, EncoderDecoderCache): + current_past_key_value = past_key_value.self_attention_cache + + # save all key/value_layer to cache to be re-used for fast auto-regressive generation + key_layer, value_layer = current_past_key_value.update( + key_layer, + value_layer, + self.layer_idx, + {"cache_position": cache_position}, ) - value_layer = self.value(current_states) - value_layer = value_layer.view( - batch_size, -1, self.num_attention_heads, self.attention_head_size - ).transpose(1, 2) - - if past_key_values is not None: - # save all key/value_layer to cache to be re-used for fast auto-regressive generation - cache_position = cache_position if not is_cross_attention else None - key_layer, value_layer = curr_past_key_value.update( - key_layer, value_layer, self.layer_idx, {"cache_position": cache_position} - ) - # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls - if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache): - past_key_values.is_updated[self.layer_idx] = True - # Take the dot product between "query" and "key" to get the raw attention scores. 
- attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - query_length, key_length = query_layer.shape[2], key_layer.shape[2] - if past_key_values is not None: - position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( - -1, 1 - ) - else: - position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) - position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) - distance = position_ids_l - position_ids_r - - positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility - - if self.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores - elif self.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - - attention_scores = attention_scores / math.sqrt(self.attention_head_size) - if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in DummyBertModel forward() function) - attention_scores = attention_scores + attention_mask - - # Normalize the attention scores to probabilities. - attention_probs = nn.functional.softmax(attention_scores, dim=-1) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. 
- attention_probs = self.dropout(attention_probs) + attn_output, attn_weights = attention_interface( + self, + query_layer, + key_layer, + value_layer, + attention_mask, + dropout=0.0 if not self.training else self.dropout.p, + scaling=self.scaling, + head_mask=head_mask, + **kwargs, + ) + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + return attn_output, attn_weights - # Mask heads if we want to - if head_mask is not None: - attention_probs = attention_probs * head_mask - context_layer = torch.matmul(attention_probs, value_layer) +class DummyBertCrossAttention(nn.Module): + def __init__(self, config, is_causal=False, layer_idx=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + self.config = config - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) - context_layer = context_layer.view(new_context_layer_shape) + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.scaling = self.attention_head_size**-0.5 - return context_layer, attention_probs + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) -class DummyBertSdpaSelfAttention(DummyBertSelfAttention): - def __init__(self, config, position_embedding_type=None, layer_idx=None): - super().__init__(config, position_embedding_type=position_embedding_type, layer_idx=layer_idx) - self.dropout_prob = config.attention_probs_dropout_prob + self.is_causal = is_causal + self.layer_idx = layer_idx - # Adapted from DummyBertSelfAttention - @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") def forward( self, hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - past_key_values: Optional[Cache] = None, - output_attentions: Optional[bool] = False, - cache_position: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[EncoderDecoderCache] = None, + **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor]: - if self.position_embedding_type != "absolute" or output_attentions or head_mask is not None: - # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once implemented. - logger.warning_once( - "DummyBertSdpaSelfAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support " - "non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. Falling back to " - "the manual attention implementation, but specifying the manual implementation will be required from " - "Transformers version v5.0.0 onwards. This warning can be removed using the argument " - '`attn_implementation="eager"` when loading the model.' 
- ) - return super().forward( - hidden_states, - attention_mask, - head_mask, - encoder_hidden_states, - past_key_values, - output_attentions, - cache_position, - ) + # determine input shapes + bsz, tgt_len = hidden_states.shape[:-1] + src_len = encoder_hidden_states.shape[1] - bsz, tgt_len, _ = hidden_states.size() + q_input_shape = (bsz, tgt_len, -1, self.attention_head_size) + kv_input_shape = (bsz, src_len, -1, self.attention_head_size) - query_layer = ( - self.query(hidden_states).view(bsz, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2) - ) - - is_updated = False - is_cross_attention = encoder_hidden_states is not None - current_states = encoder_hidden_states if is_cross_attention else hidden_states - if past_key_values is not None: - if isinstance(past_key_values, EncoderDecoderCache): - is_updated = past_key_values.is_updated.get(self.layer_idx) - if is_cross_attention: - # after the first generated id, we can subsequently re-use all key/value_states from cache - curr_past_key_value = past_key_values.cross_attention_cache - else: - curr_past_key_value = past_key_values.self_attention_cache - else: - curr_past_key_value = past_key_values + # get query proj + query_layer = self.query(hidden_states).view(*q_input_shape).transpose(1, 2) - current_states = encoder_hidden_states if is_cross_attention else hidden_states - if is_cross_attention and past_key_values is not None and is_updated: + is_updated = past_key_value.is_updated.get(self.layer_idx) if past_key_value is not None else False + if past_key_value is not None and is_updated: # reuse k,v, cross_attentions - key_layer = curr_past_key_value.layers[self.layer_idx].keys - value_layer = curr_past_key_value.layers[self.layer_idx].values + key_layer = past_key_value.cross_attention_cache.layers[self.layer_idx].keys + value_layer = past_key_value.cross_attention_cache.layers[self.layer_idx].values else: - key_layer = ( - self.key(current_states) - .view(bsz, -1, self.num_attention_heads, self.attention_head_size) - .transpose(1, 2) - ) - value_layer = ( - self.value(current_states) - .view(bsz, -1, self.num_attention_heads, self.attention_head_size) - .transpose(1, 2) - ) + key_layer = self.key(encoder_hidden_states).view(*kv_input_shape).transpose(1, 2) + value_layer = self.value(encoder_hidden_states).view(*kv_input_shape).transpose(1, 2) - if past_key_values is not None: - # save all key/value_layer to cache to be re-used for fast auto-regressive generation - cache_position = cache_position if not is_cross_attention else None - key_layer, value_layer = curr_past_key_value.update( - key_layer, value_layer, self.layer_idx, {"cache_position": cache_position} + if past_key_value is not None: + # save all states to the cache + key_layer, value_layer = past_key_value.cross_attention_cache.update( + key_layer, value_layer, self.layer_idx ) # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls - if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache): - past_key_values.is_updated[self.layer_idx] = True + past_key_value.is_updated[self.layer_idx] = True - # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment - # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. 
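# --- Illustrative aside (not part of the patch): the dedicated *SdpaSelfAttention
# classes are dropped because SDPA is now reached through the shared attention
# interface registry rather than a subclass. The sketch below (assuming torch 2.x)
# only checks, on toy shapes, that torch's SDPA kernel matches the manual
# softmax(QK^T * scale) @ V computation kept in the eager path.
import torch
import torch.nn.functional as F

q = torch.randn(1, 2, 4, 8)  # [batch, heads, seq_len, head_dim]
k = torch.randn(1, 2, 4, 8)
v = torch.randn(1, 2, 4, 8)

sdpa_out = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=False)

scale = q.size(-1) ** -0.5
manual_out = torch.softmax(torch.matmul(q, k.transpose(2, 3)) * scale, dim=-1) @ v

torch.testing.assert_close(sdpa_out, manual_out, rtol=1e-4, atol=1e-4)
# --- end of aside -----------------------------------------------------------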
- # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create - # a causal mask in case tgt_len == 1. - is_causal = self.is_decoder and not is_cross_attention and attention_mask is None and tgt_len > 1 + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] - attn_output = torch.nn.functional.scaled_dot_product_attention( + attn_output, attn_weights = attention_interface( + self, query_layer, key_layer, value_layer, - attn_mask=attention_mask, - dropout_p=self.dropout_prob if self.training else 0.0, - is_causal=is_causal, + attention_mask, + dropout=0.0 if not self.training else self.dropout.p, + scaling=self.scaling, + head_mask=head_mask, + **kwargs, ) - - attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, self.all_head_size) - - return attn_output, None + attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous() + return attn_output, attn_weights class DummyBertSelfOutput(nn.Module): @@ -338,20 +293,12 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -DUMMY_BERT_SELF_ATTENTION_CLASSES = { - "eager": DummyBertSelfAttention, - "sdpa": DummyBertSdpaSelfAttention, -} - - class DummyBertAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, layer_idx=None): + def __init__(self, config, is_causal=False, layer_idx=None, is_cross_attention=False): super().__init__() - self.self = DUMMY_BERT_SELF_ATTENTION_CLASSES[config._attn_implementation]( - config, - position_embedding_type=position_embedding_type, - layer_idx=layer_idx, - ) + self.is_cross_attention = is_cross_attention + attention_class = DummyBertCrossAttention if is_cross_attention else DummyBertSelfAttention + self.self = attention_class(config, is_causal=is_causal, layer_idx=layer_idx) self.output = DummyBertSelfOutput(config) self.pruned_heads = set() @@ -373,29 +320,29 @@ def prune_heads(self, heads): self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - past_key_values: Optional[Cache] = None, - output_attentions: Optional[bool] = False, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Cache] = None, cache_position: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor]: - self_outputs = self.self( + attention_mask = attention_mask if not self.is_cross_attention else encoder_attention_mask + attention_output, attn_weights = self.self( hidden_states, + encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - past_key_values=past_key_values, - output_attentions=output_attentions, + past_key_value=past_key_value, cache_position=cache_position, + **kwargs, ) - attention_output = self.output(self_outputs[0], hidden_states) - outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them - return outputs + attention_output = self.output(attention_output, hidden_states) + return 
attention_output, attn_weights class DummyBertIntermediate(nn.Module): @@ -432,17 +379,21 @@ def __init__(self, config, layer_idx=None): super().__init__() self.chunk_size_feed_forward = config.chunk_size_feed_forward self.seq_len_dim = 1 - self.attention = DummyBertAttention(config, layer_idx=layer_idx) + self.attention = DummyBertAttention(config, is_causal=config.is_decoder, layer_idx=layer_idx) self.is_decoder = config.is_decoder self.add_cross_attention = config.add_cross_attention if self.add_cross_attention: if not self.is_decoder: raise ValueError(f"{self} should be used as a decoder model if cross attention is added") - self.crossattention = DummyBertAttention(config, position_embedding_type="absolute", layer_idx=layer_idx) + self.crossattention = DummyBertAttention( + config, + is_causal=False, + layer_idx=layer_idx, + is_cross_attention=True, + ) self.intermediate = DummyBertIntermediate(config) self.output = DummyBertOutput(config) - @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") def forward( self, hidden_states: torch.Tensor, @@ -450,20 +401,19 @@ def forward( head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_values: Optional[Cache] = None, - output_attentions: Optional[bool] = False, + past_key_value: Optional[Cache] = None, cache_position: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor]: - self_attention_outputs = self.attention( + self_attention_output, _ = self.attention( hidden_states, - attention_mask=attention_mask, - head_mask=head_mask, - output_attentions=output_attentions, - past_key_values=past_key_values, + attention_mask, + head_mask, + past_key_value=past_key_value, cache_position=cache_position, + **kwargs, ) - attention_output = self_attention_outputs[0] - outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + attention_output = self_attention_output if self.is_decoder and encoder_hidden_states is not None: if not hasattr(self, "crossattention"): @@ -472,24 +422,21 @@ def forward( " by setting `config.add_cross_attention=True`" ) - cross_attention_outputs = self.crossattention( - attention_output, - attention_mask=encoder_attention_mask, - head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - past_key_values=past_key_values, - output_attentions=output_attentions, - cache_position=cache_position, + cross_attention_output, _ = self.crossattention( + self_attention_output, + None, # attention_mask + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value=past_key_value, + **kwargs, ) - attention_output = cross_attention_outputs[0] - outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights + attention_output = cross_attention_output layer_output = apply_chunking_to_forward( self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output ) - outputs = (layer_output,) + outputs - - return outputs + return layer_output def feed_forward_chunk(self, attention_output): intermediate_output = self.intermediate(attention_output) @@ -498,11 +445,10 @@ def feed_forward_chunk(self, attention_output): class DummyBertEncoder(nn.Module): - def __init__(self, config, layer_idx=None): + def __init__(self, config): super().__init__() self.config = config self.layer = nn.ModuleList([DummyBertLayer(config, layer_idx=i) for i in 
range(config.num_hidden_layers)]) - self.gradient_checkpointing = False def forward( self, @@ -511,79 +457,28 @@ def forward( head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = False, - output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, cache_position: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: - all_hidden_states = () if output_hidden_states else None - all_self_attentions = () if output_attentions else None - all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - if use_cache and self.config.is_decoder and past_key_values is None: - past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config)) - - if use_cache and self.config.is_decoder and isinstance(past_key_values, tuple): - logger.warning_once( - "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. " - "You should pass an instance of `EncoderDecoderCache` instead, e.g. " - "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`." - ) - past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) - for i, layer_module in enumerate(self.layer): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - layer_head_mask = head_mask[i] if head_mask is not None else None - layer_outputs = layer_module( + hidden_states = layer_module( hidden_states, attention_mask, layer_head_mask, encoder_hidden_states, # as a positional argument for gradient checkpointing encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_attentions=output_attentions, + past_key_value=past_key_values, cache_position=cache_position, + **kwargs, ) - hidden_states = layer_outputs[0] - if output_attentions: - all_self_attentions = all_self_attentions + (layer_outputs[1],) - if self.config.add_cross_attention: - all_cross_attentions = all_cross_attentions + (layer_outputs[2],) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - past_key_values, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, - past_key_values=past_key_values, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - cross_attentions=all_cross_attentions, + past_key_values=past_key_values if use_cache else None, ) @@ -644,10 +539,18 @@ def forward(self, hidden_states): @auto_docstring class DummyBertPreTrainedModel(PreTrainedModel): - config: DummyBertConfig + config_class = DummyBertConfig base_model_prefix = "dummy_bert" supports_gradient_checkpointing = True + _supports_flash_attn = True _supports_sdpa = True + _supports_flex_attn = True + _supports_attention_backend = True + _can_record_outputs = 
{ + "hidden_states": DummyBertLayer, + "attentions": DummyBertSelfAttention, + "cross_attentions": DummyBertCrossAttention, + } def _init_weights(self, module): """Initialize the weights""" @@ -688,15 +591,13 @@ def __init__(self, config, add_pooling_layer=True): """ super().__init__(config) self.config = config + self.gradient_checkpointing = False self.embeddings = DummyBertEmbeddings(config) self.encoder = DummyBertEncoder(config) self.pooler = DummyBertPooler(config) if add_pooling_layer else None - self.attn_implementation = config._attn_implementation - self.position_embedding_type = config.position_embedding_type - # Initialize weights and apply final processing self.post_init() @@ -714,6 +615,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) + @check_model_inputs @auto_docstring def forward( self, @@ -731,46 +633,37 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, cache_position: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache else: use_cache = False - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) - input_shape = input_ids.size() - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - batch_size, seq_length = input_shape - device = input_ids.device if input_ids is not None else inputs_embeds.device - - past_key_values_length = 0 - if past_key_values is not None: - past_key_values_length = ( - past_key_values[0][0].shape[-2] - if not isinstance(past_key_values, Cache) - else past_key_values.get_seq_length() + return_legacy_cache = False + if use_cache and not isinstance(past_key_values, Cache): + logger.warning_once( + "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. " + "You should pass an instance of `EncoderDecoderCache` instead, e.g. " + "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`." 
) + return_legacy_cache = True + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) - if token_type_ids is None: - if hasattr(self.embeddings, "token_type_ids"): - buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) - token_type_ids = buffered_token_type_ids_expanded - else: - token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if input_ids is not None: + device = input_ids.device + input_shape = input_ids.shape + else: + device = inputs_embeds.device + input_shape = inputs_embeds.shape[:-1] + + seq_length = input_shape[1] + past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0 + if cache_position is None: + cache_position = torch.arange(past_key_values_length, past_key_values_length + seq_length, device=device) embedding_output = self.embeddings( input_ids=input_ids, @@ -780,55 +673,16 @@ def forward( past_key_values_length=past_key_values_length, ) - if attention_mask is None: - attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device) - - use_sdpa_attention_masks = ( - self.attn_implementation == "sdpa" - and self.position_embedding_type == "absolute" - and head_mask is None - and not output_attentions + attention_mask, encoder_attention_mask = self._create_attention_masks( + input_shape=input_shape, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + embedding_output=embedding_output, + encoder_hidden_states=encoder_hidden_states, + cache_position=cache_position, + past_key_values=past_key_values, ) - # Expand the attention mask - if use_sdpa_attention_masks and attention_mask.dim() == 2: - # Expand the attention mask for SDPA. - # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len] - if self.config.is_decoder: - extended_attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - input_shape, - embedding_output, - past_key_values_length, - ) - else: - extended_attention_mask = _prepare_4d_attention_mask_for_sdpa( - attention_mask, embedding_output.dtype, tgt_len=seq_length - ) - else: - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. - extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) - - # If a 2D or 3D attention mask is provided for the cross-attention - # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.config.is_decoder and encoder_hidden_states is not None: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() - encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) - if encoder_attention_mask is None: - encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) - - if use_sdpa_attention_masks and encoder_attention_mask.dim() == 2: - # Expand the attention mask for SDPA. 
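# --- Illustrative aside (not part of the patch): conceptually, the 4D-mask
# helpers used by the new `_create_attention_masks` path turn a 2D padding mask
# [bsz, src_len] into an additive float mask [bsz, 1, tgt_len, src_len] with 0.0
# on kept positions and the dtype's minimum on masked ones. This is a hand-rolled
# sketch of that idea, not the transformers helper itself.
import torch


def expand_padding_mask(mask_2d: torch.Tensor, dtype: torch.dtype, tgt_len: int) -> torch.Tensor:
    bsz, src_len = mask_2d.shape
    expanded = mask_2d[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
    # invert: 1 -> keep (0.0), 0 -> mask (large negative)
    return (1.0 - expanded) * torch.finfo(dtype).min


pad_mask = torch.tensor([[1, 1, 1, 0]])  # last position is padding
additive = expand_padding_mask(pad_mask, torch.float32, tgt_len=4)
assert additive.shape == (1, 1, 4, 4)
assert additive[0, 0, 0, -1] < 0 and additive[0, 0, 0, 0] == 0.0
# --- end of aside -----------------------------------------------------------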
- # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len] - encoder_extended_attention_mask = _prepare_4d_attention_mask_for_sdpa( - encoder_attention_mask, embedding_output.dtype, tgt_len=seq_length - ) - else: - encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) - else: - encoder_extended_attention_mask = None - # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N @@ -838,28 +692,131 @@ def forward( encoder_outputs = self.encoder( embedding_output, - attention_mask=extended_attention_mask, + attention_mask=attention_mask, head_mask=head_mask, encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask, + encoder_attention_mask=encoder_attention_mask, past_key_values=past_key_values, use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, cache_position=cache_position, + position_ids=position_ids, + **kwargs, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] + if return_legacy_cache: + encoder_outputs.past_key_values = encoder_outputs.past_key_values.to_legacy_cache() return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, past_key_values=encoder_outputs.past_key_values, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - cross_attentions=encoder_outputs.cross_attentions, ) + + def _create_attention_masks( + self, + input_shape, + attention_mask, + encoder_attention_mask, + embedding_output, + encoder_hidden_states, + cache_position, + past_key_values, + ): + if attention_mask is not None and attention_mask.dim() == 2: + if self.config.is_decoder: + attention_mask = create_causal_mask( + config=self.config, + input_embeds=embedding_output, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + ) + else: + attention_mask = self._update_full_mask( + attention_mask, + embedding_output, + ) + elif attention_mask is not None and attention_mask.dim() == 3: + if "flash" in self.config._attn_implementation or self.config._attn_implementation == "flex_attention": + raise ValueError( + "Passing attention mask with a 3D/4D shape does not work with type " + f"{self.config._attn_implementation} - please use either `sdpa` or `eager` instead." + ) + attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + + if encoder_attention_mask is not None: + if encoder_attention_mask.dim() == 2: + encoder_attention_mask = self._update_cross_attn_mask( + encoder_hidden_states, + encoder_attention_mask, + embedding_output.shape[:2], + embedding_output, + ) + else: + if "flash" in self.config._attn_implementation or self.config._attn_implementation == "flex_attention": + raise ValueError( + "Passing attention mask with a 3D/4D shape does not work with type " + f"{self.config._attn_implementation} - please use either `sdpa` or `eager` instead." 
+ ) + encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask) + + return attention_mask, encoder_attention_mask + + def _update_full_mask( + self, + attention_mask: Union[torch.Tensor, None], + inputs_embeds: torch.Tensor, + ): + if attention_mask is not None: + if "flash" in self.config._attn_implementation: + attention_mask = attention_mask if 0 in attention_mask else None + elif self.config._attn_implementation == "sdpa": + # output_attentions=True & head_mask can not be supported when using SDPA, fall back to + # the manual implementation that requires a 4D causal mask in all cases. + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype) + elif self.config._attn_implementation == "flex_attention": + if isinstance(attention_mask, torch.Tensor): + attention_mask = make_flex_block_causal_mask(attention_mask, is_causal=False) + else: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype) + + return attention_mask + + def _update_cross_attn_mask( + self, + encoder_hidden_states: Union[torch.Tensor, None], + encoder_attention_mask: Union[torch.Tensor, None], + input_shape: torch.Size, + inputs_embeds: torch.Tensor, + ): + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + if "flash" in self.config._attn_implementation: + encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None + elif self.config._attn_implementation == "sdpa": + # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa( + encoder_attention_mask, + inputs_embeds.dtype, + tgt_len=input_shape[-1], + ) + elif self.config._attn_implementation == "flex_attention": + if isinstance(encoder_attention_mask, torch.Tensor): + encoder_attention_mask = make_flex_block_causal_mask( + encoder_attention_mask, + query_length=input_shape[-1], + is_causal=False, + ) + else: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask( + encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + return encoder_attention_mask diff --git a/examples/modular-transformers/modeling_roberta.py b/examples/modular-transformers/modeling_roberta.py index 2ae39a555892..3a7121822e97 100644 --- a/examples/modular-transformers/modeling_roberta.py +++ b/examples/modular-transformers/modeling_roberta.py @@ -4,24 +4,29 @@ # the file from the modular. If any change should be done, please apply the change to the # modular_roberta.py file directly. One of our CI enforces this. 
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 -import math -from typing import Optional, Union +from typing import Callable, Optional, Union import torch import torch.nn as nn from ...activations import ACT2FN -from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache -from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa, _prepare_4d_causal_attention_mask_for_sdpa +from ...cache_utils import Cache, EncoderDecoderCache +from ...masking_utils import create_causal_mask +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_attention_mask_for_sdpa from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, BaseModelOutputWithPoolingAndCrossAttentions -from ...modeling_utils import PreTrainedModel +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer -from ...utils import auto_docstring, logging -from ...utils.deprecation import deprecate_kwarg +from ...utils import TransformersKwargs, auto_docstring, is_torch_flex_attn_available, logging +from ...utils.generic import check_model_inputs from .configuration_roberta import RobertaConfig +if is_torch_flex_attn_available(): + from ...integrations.flex_attention import make_flex_block_causal_mask + + logger = logging.get_logger(__name__) @@ -39,7 +44,6 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) @@ -61,7 +65,7 @@ def forward( else: input_shape = inputs_embeds.size()[:-1] - seq_length = input_shape[1] + batch_size, seq_length = input_shape if position_ids is None: position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] @@ -71,260 +75,211 @@ def forward( # issue #5664 if token_type_ids is None: if hasattr(self, "token_type_ids"): - buffered_token_type_ids = self.token_type_ids[:, :seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) - token_type_ids = buffered_token_type_ids_expanded + # NOTE: We assume either pos ids to have bsz == 1 (broadcastable) or bsz == effective bsz (input_shape[0]) + buffered_token_type_ids = self.token_type_ids.expand(position_ids.shape[0], -1) + buffered_token_type_ids = torch.gather(buffered_token_type_ids, dim=1, index=position_ids) + token_type_ids = buffered_token_type_ids.expand(batch_size, seq_length) else: token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings = embeddings + position_embeddings + embeddings = self.LayerNorm(embeddings) embeddings = 
self.dropout(embeddings) return embeddings +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: Optional[float] = None, + dropout: float = 0.0, + head_mask: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], +): + if scaling is None: + scaling = query.size(-1) ** -0.5 + + # Take the dot product between "query" and "key" to get the raw attention scores. + attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling + + if attention_mask is not None and attention_mask.ndim == 4: + attention_mask = attention_mask[:, :, :, : key.shape[-2]] + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + + if head_mask is not None: + attn_weights = attn_weights * head_mask + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + class RobertaSelfAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, layer_idx=None): + def __init__(self, config, is_causal=False, layer_idx=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " f"heads ({config.num_attention_heads})" ) + self.config = config self.num_attention_heads = config.num_attention_heads self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size + self.scaling = self.attention_head_size**-0.5 self.query = nn.Linear(config.hidden_size, self.all_head_size) self.key = nn.Linear(config.hidden_size, self.all_head_size) self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_decoder = config.is_decoder + self.is_causal = is_causal self.layer_idx = layer_idx - @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, - encoder_hidden_states: Optional[torch.FloatTensor] = None, - past_key_values: Optional[Cache] = None, - output_attentions: Optional[bool] = False, + past_key_value: Optional[Cache] = None, cache_position: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor]: - batch_size, seq_length, _ = hidden_states.shape - query_layer = self.query(hidden_states) - query_layer = query_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose( - 1, 2 - ) - - is_updated = False - is_cross_attention = encoder_hidden_states is not None - if past_key_values is not None: - if isinstance(past_key_values, EncoderDecoderCache): - is_updated = past_key_values.is_updated.get(self.layer_idx) - if 
is_cross_attention: - # after the first generated id, we can subsequently re-use all key/value_layer from cache - curr_past_key_value = past_key_values.cross_attention_cache - else: - curr_past_key_value = past_key_values.self_attention_cache - else: - curr_past_key_value = past_key_values - - current_states = encoder_hidden_states if is_cross_attention else hidden_states - if is_cross_attention and past_key_values is not None and is_updated: - # reuse k,v, cross_attentions - key_layer = curr_past_key_value.layers[self.layer_idx].keys - value_layer = curr_past_key_value.layers[self.layer_idx].values - else: - key_layer = self.key(current_states) - key_layer = key_layer.view(batch_size, -1, self.num_attention_heads, self.attention_head_size).transpose( - 1, 2 + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.attention_head_size) + + # get all proj + query_layer = self.query(hidden_states).view(*hidden_shape).transpose(1, 2) + key_layer = self.key(hidden_states).view(*hidden_shape).transpose(1, 2) + value_layer = self.value(hidden_states).view(*hidden_shape).transpose(1, 2) + + if past_key_value is not None: + # decoder-only roberta can have a simple dynamic cache for example + current_past_key_value = past_key_value + if isinstance(past_key_value, EncoderDecoderCache): + current_past_key_value = past_key_value.self_attention_cache + + # save all key/value_layer to cache to be re-used for fast auto-regressive generation + key_layer, value_layer = current_past_key_value.update( + key_layer, + value_layer, + self.layer_idx, + {"cache_position": cache_position}, ) - value_layer = self.value(current_states) - value_layer = value_layer.view( - batch_size, -1, self.num_attention_heads, self.attention_head_size - ).transpose(1, 2) - - if past_key_values is not None: - # save all key/value_layer to cache to be re-used for fast auto-regressive generation - cache_position = cache_position if not is_cross_attention else None - key_layer, value_layer = curr_past_key_value.update( - key_layer, value_layer, self.layer_idx, {"cache_position": cache_position} - ) - # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls - if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache): - past_key_values.is_updated[self.layer_idx] = True - # Take the dot product between "query" and "key" to get the raw attention scores. 
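# Illustrative sketch, assuming the public `transformers.DynamicCache` API used in the hunk
# above: each forward pushes the freshly projected key/value states into the layer's slot via
# `update(...)` and gets the concatenated history back, so autoregressive decoding only
# projects the new tokens.
import torch
from transformers import DynamicCache

cache, layer_idx = DynamicCache(), 0
for step in range(3):
    # one new token per step: [bsz, num_heads, seq_len=1, head_dim]
    k_new, v_new = torch.randn(1, 4, 1, 8), torch.randn(1, 4, 1, 8)
    k_all, v_all = cache.update(k_new, v_new, layer_idx, {"cache_position": torch.tensor([step])})
    print(step, k_all.shape[-2], cache.get_seq_length())  # cached length grows: 1, 2, 3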
- attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - query_length, key_length = query_layer.shape[2], key_layer.shape[2] - if past_key_values is not None: - position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( - -1, 1 - ) - else: - position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) - position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) - distance = position_ids_l - position_ids_r - - positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility - - if self.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores - elif self.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - - attention_scores = attention_scores / math.sqrt(self.attention_head_size) - if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in RobertaModel forward() function) - attention_scores = attention_scores + attention_mask - - # Normalize the attention scores to probabilities. - attention_probs = nn.functional.softmax(attention_scores, dim=-1) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. 
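# Quick numerical check (illustrative only): with the relative-position terms removed, the
# eager path is a plain scaled dot-product plus additive mask, so it matches
# torch.nn.functional.scaled_dot_product_attention on the same inputs.
import torch

q, k, v = (torch.randn(2, 4, 5, 8) for _ in range(3))   # [bsz, heads, seq, head_dim]
mask = torch.zeros(2, 1, 5, 5)                           # additive mask, 0 = attend
scores = torch.matmul(q, k.transpose(2, 3)) * q.size(-1) ** -0.5 + mask
eager = torch.matmul(torch.softmax(scores, dim=-1), v)
sdpa = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=mask)
print(torch.allclose(eager, sdpa, atol=1e-5))            # True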
- attention_probs = self.dropout(attention_probs) + attn_output, attn_weights = attention_interface( + self, + query_layer, + key_layer, + value_layer, + attention_mask, + dropout=0.0 if not self.training else self.dropout.p, + scaling=self.scaling, + head_mask=head_mask, + **kwargs, + ) + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + return attn_output, attn_weights - # Mask heads if we want to - if head_mask is not None: - attention_probs = attention_probs * head_mask - context_layer = torch.matmul(attention_probs, value_layer) +class RobertaCrossAttention(nn.Module): + def __init__(self, config, is_causal=False, layer_idx=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + self.config = config - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) - context_layer = context_layer.view(new_context_layer_shape) + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.scaling = self.attention_head_size**-0.5 - return context_layer, attention_probs + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) -class RobertaSdpaSelfAttention(RobertaSelfAttention): - def __init__(self, config, position_embedding_type=None, layer_idx=None): - super().__init__(config, position_embedding_type=position_embedding_type, layer_idx=layer_idx) - self.dropout_prob = config.attention_probs_dropout_prob + self.is_causal = is_causal + self.layer_idx = layer_idx - # Adapted from RobertaSelfAttention - @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") def forward( self, hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - past_key_values: Optional[Cache] = None, - output_attentions: Optional[bool] = False, - cache_position: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[EncoderDecoderCache] = None, + **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor]: - if self.position_embedding_type != "absolute" or output_attentions or head_mask is not None: - # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once implemented. - logger.warning_once( - "RobertaSdpaSelfAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support " - "non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. Falling back to " - "the manual attention implementation, but specifying the manual implementation will be required from " - "Transformers version v5.0.0 onwards. This warning can be removed using the argument " - '`attn_implementation="eager"` when loading the model.' 
- ) - return super().forward( - hidden_states, - attention_mask, - head_mask, - encoder_hidden_states, - past_key_values, - output_attentions, - cache_position, - ) + # determine input shapes + bsz, tgt_len = hidden_states.shape[:-1] + src_len = encoder_hidden_states.shape[1] - bsz, tgt_len, _ = hidden_states.size() + q_input_shape = (bsz, tgt_len, -1, self.attention_head_size) + kv_input_shape = (bsz, src_len, -1, self.attention_head_size) - query_layer = ( - self.query(hidden_states).view(bsz, -1, self.num_attention_heads, self.attention_head_size).transpose(1, 2) - ) - - is_updated = False - is_cross_attention = encoder_hidden_states is not None - current_states = encoder_hidden_states if is_cross_attention else hidden_states - if past_key_values is not None: - if isinstance(past_key_values, EncoderDecoderCache): - is_updated = past_key_values.is_updated.get(self.layer_idx) - if is_cross_attention: - # after the first generated id, we can subsequently re-use all key/value_states from cache - curr_past_key_value = past_key_values.cross_attention_cache - else: - curr_past_key_value = past_key_values.self_attention_cache - else: - curr_past_key_value = past_key_values + # get query proj + query_layer = self.query(hidden_states).view(*q_input_shape).transpose(1, 2) - current_states = encoder_hidden_states if is_cross_attention else hidden_states - if is_cross_attention and past_key_values is not None and is_updated: + is_updated = past_key_value.is_updated.get(self.layer_idx) if past_key_value is not None else False + if past_key_value is not None and is_updated: # reuse k,v, cross_attentions - key_layer = curr_past_key_value.layers[self.layer_idx].keys - value_layer = curr_past_key_value.layers[self.layer_idx].values + key_layer = past_key_value.cross_attention_cache.layers[self.layer_idx].keys + value_layer = past_key_value.cross_attention_cache.layers[self.layer_idx].values else: - key_layer = ( - self.key(current_states) - .view(bsz, -1, self.num_attention_heads, self.attention_head_size) - .transpose(1, 2) - ) - value_layer = ( - self.value(current_states) - .view(bsz, -1, self.num_attention_heads, self.attention_head_size) - .transpose(1, 2) - ) + key_layer = self.key(encoder_hidden_states).view(*kv_input_shape).transpose(1, 2) + value_layer = self.value(encoder_hidden_states).view(*kv_input_shape).transpose(1, 2) - if past_key_values is not None: - # save all key/value_layer to cache to be re-used for fast auto-regressive generation - cache_position = cache_position if not is_cross_attention else None - key_layer, value_layer = curr_past_key_value.update( - key_layer, value_layer, self.layer_idx, {"cache_position": cache_position} + if past_key_value is not None: + # save all states to the cache + key_layer, value_layer = past_key_value.cross_attention_cache.update( + key_layer, value_layer, self.layer_idx ) # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls - if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache): - past_key_values.is_updated[self.layer_idx] = True + past_key_value.is_updated[self.layer_idx] = True - # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment - # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. 
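# Illustrative sketch with a stand-in dict cache: the cross-attention pattern above projects
# the encoder-side key/value states exactly once per generation and flags the layer as
# updated, so every later decoding step reuses the stored tensors instead of re-projecting
# the (static) encoder outputs.
import torch

cross_cache, is_updated = {}, {}

def cross_kv(layer_idx, encoder_hidden_states, key_proj, value_proj):
    if is_updated.get(layer_idx):
        return cross_cache[layer_idx]                 # reuse cached projections
    k, v = key_proj(encoder_hidden_states), value_proj(encoder_hidden_states)
    cross_cache[layer_idx] = (k, v)
    is_updated[layer_idx] = True                      # computed once, reused afterwards
    return k, v

key_proj, value_proj = torch.nn.Linear(16, 16), torch.nn.Linear(16, 16)
enc = torch.randn(1, 7, 16)
k1, _ = cross_kv(0, enc, key_proj, value_proj)
k2, _ = cross_kv(0, enc, key_proj, value_proj)
print(k1 is k2)  # True: the second call hit the cache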
- # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create - # a causal mask in case tgt_len == 1. - is_causal = self.is_decoder and not is_cross_attention and attention_mask is None and tgt_len > 1 + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] - attn_output = torch.nn.functional.scaled_dot_product_attention( + attn_output, attn_weights = attention_interface( + self, query_layer, key_layer, value_layer, - attn_mask=attention_mask, - dropout_p=self.dropout_prob if self.training else 0.0, - is_causal=is_causal, + attention_mask, + dropout=0.0 if not self.training else self.dropout.p, + scaling=self.scaling, + head_mask=head_mask, + **kwargs, ) - - attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, tgt_len, self.all_head_size) - - return attn_output, None + attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous() + return attn_output, attn_weights class RobertaSelfOutput(nn.Module): @@ -341,20 +296,12 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to return hidden_states -ROBERTA_SELF_ATTENTION_CLASSES = { - "eager": RobertaSelfAttention, - "sdpa": RobertaSdpaSelfAttention, -} - - class RobertaAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, layer_idx=None): + def __init__(self, config, is_causal=False, layer_idx=None, is_cross_attention=False): super().__init__() - self.self = ROBERTA_SELF_ATTENTION_CLASSES[config._attn_implementation]( - config, - position_embedding_type=position_embedding_type, - layer_idx=layer_idx, - ) + self.is_cross_attention = is_cross_attention + attention_class = RobertaCrossAttention if is_cross_attention else RobertaSelfAttention + self.self = attention_class(config, is_causal=is_causal, layer_idx=layer_idx) self.output = RobertaSelfOutput(config) self.pruned_heads = set() @@ -376,29 +323,29 @@ def prune_heads(self, heads): self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads self.pruned_heads = self.pruned_heads.union(heads) - @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.FloatTensor] = None, head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, - past_key_values: Optional[Cache] = None, - output_attentions: Optional[bool] = False, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Cache] = None, cache_position: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor]: - self_outputs = self.self( + attention_mask = attention_mask if not self.is_cross_attention else encoder_attention_mask + attention_output, attn_weights = self.self( hidden_states, + encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask, head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - past_key_values=past_key_values, - output_attentions=output_attentions, + past_key_value=past_key_value, cache_position=cache_position, + **kwargs, ) - attention_output = self.output(self_outputs[0], hidden_states) - outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them - return outputs + attention_output = self.output(attention_output, hidden_states) + return attention_output, 
attn_weights class RobertaIntermediate(nn.Module): @@ -435,17 +382,21 @@ def __init__(self, config, layer_idx=None): super().__init__() self.chunk_size_feed_forward = config.chunk_size_feed_forward self.seq_len_dim = 1 - self.attention = RobertaAttention(config, layer_idx=layer_idx) + self.attention = RobertaAttention(config, is_causal=config.is_decoder, layer_idx=layer_idx) self.is_decoder = config.is_decoder self.add_cross_attention = config.add_cross_attention if self.add_cross_attention: if not self.is_decoder: raise ValueError(f"{self} should be used as a decoder model if cross attention is added") - self.crossattention = RobertaAttention(config, position_embedding_type="absolute", layer_idx=layer_idx) + self.crossattention = RobertaAttention( + config, + is_causal=False, + layer_idx=layer_idx, + is_cross_attention=True, + ) self.intermediate = RobertaIntermediate(config) self.output = RobertaOutput(config) - @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") def forward( self, hidden_states: torch.Tensor, @@ -453,20 +404,19 @@ def forward( head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_values: Optional[Cache] = None, - output_attentions: Optional[bool] = False, + past_key_value: Optional[Cache] = None, cache_position: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], ) -> tuple[torch.Tensor]: - self_attention_outputs = self.attention( + self_attention_output, _ = self.attention( hidden_states, - attention_mask=attention_mask, - head_mask=head_mask, - output_attentions=output_attentions, - past_key_values=past_key_values, + attention_mask, + head_mask, + past_key_value=past_key_value, cache_position=cache_position, + **kwargs, ) - attention_output = self_attention_outputs[0] - outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + attention_output = self_attention_output if self.is_decoder and encoder_hidden_states is not None: if not hasattr(self, "crossattention"): @@ -475,24 +425,21 @@ def forward( " by setting `config.add_cross_attention=True`" ) - cross_attention_outputs = self.crossattention( - attention_output, - attention_mask=encoder_attention_mask, - head_mask=head_mask, - encoder_hidden_states=encoder_hidden_states, - past_key_values=past_key_values, - output_attentions=output_attentions, - cache_position=cache_position, + cross_attention_output, _ = self.crossattention( + self_attention_output, + None, # attention_mask + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value=past_key_value, + **kwargs, ) - attention_output = cross_attention_outputs[0] - outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights + attention_output = cross_attention_output layer_output = apply_chunking_to_forward( self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output ) - outputs = (layer_output,) + outputs - - return outputs + return layer_output def feed_forward_chunk(self, attention_output): intermediate_output = self.intermediate(attention_output) @@ -501,11 +448,10 @@ def feed_forward_chunk(self, attention_output): class RobertaEncoder(nn.Module): - def __init__(self, config, layer_idx=None): + def __init__(self, config): super().__init__() self.config = config self.layer = nn.ModuleList([RobertaLayer(config, layer_idx=i) for i in range(config.num_hidden_layers)]) - 
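# Context for the unchanged `apply_chunking_to_forward` call kept in `feed_forward_chunk`:
# the helper slices the sequence dimension into chunks of `chunk_size_feed_forward` tokens to
# bound peak activation memory. Illustrative usage, assuming `transformers.pytorch_utils`
# still exposes the helper with the signature used above.
import torch
from transformers.pytorch_utils import apply_chunking_to_forward

ffn = torch.nn.Linear(8, 8)

def feed_forward_chunk(attention_output):
    return ffn(attention_output)

x = torch.randn(2, 6, 8)                                           # [bsz, seq, hidden]
chunked = apply_chunking_to_forward(feed_forward_chunk, 2, 1, x)   # chunks of 2 along dim 1
print(torch.allclose(feed_forward_chunk(x), chunked, atol=1e-6))   # True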
self.gradient_checkpointing = False def forward( self, @@ -514,79 +460,28 @@ def forward( head_mask: Optional[torch.FloatTensor] = None, encoder_hidden_states: Optional[torch.FloatTensor] = None, encoder_attention_mask: Optional[torch.FloatTensor] = None, - past_key_values: Optional[tuple[tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[Cache] = None, use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = False, - output_hidden_states: Optional[bool] = False, - return_dict: Optional[bool] = True, cache_position: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: - all_hidden_states = () if output_hidden_states else None - all_self_attentions = () if output_attentions else None - all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None - - if self.gradient_checkpointing and self.training: - if use_cache: - logger.warning_once( - "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." - ) - use_cache = False - - if use_cache and self.config.is_decoder and past_key_values is None: - past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config)) - - if use_cache and self.config.is_decoder and isinstance(past_key_values, tuple): - logger.warning_once( - "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. " - "You should pass an instance of `EncoderDecoderCache` instead, e.g. " - "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`." - ) - past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) - for i, layer_module in enumerate(self.layer): - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - layer_head_mask = head_mask[i] if head_mask is not None else None - layer_outputs = layer_module( + hidden_states = layer_module( hidden_states, attention_mask, layer_head_mask, encoder_hidden_states, # as a positional argument for gradient checkpointing encoder_attention_mask=encoder_attention_mask, - past_key_values=past_key_values, - output_attentions=output_attentions, + past_key_value=past_key_values, cache_position=cache_position, + **kwargs, ) - hidden_states = layer_outputs[0] - if output_attentions: - all_self_attentions = all_self_attentions + (layer_outputs[1],) - if self.config.add_cross_attention: - all_cross_attentions = all_cross_attentions + (layer_outputs[2],) - - if output_hidden_states: - all_hidden_states = all_hidden_states + (hidden_states,) - - if not return_dict: - return tuple( - v - for v in [ - hidden_states, - past_key_values, - all_hidden_states, - all_self_attentions, - all_cross_attentions, - ] - if v is not None - ) return BaseModelOutputWithPastAndCrossAttentions( last_hidden_state=hidden_states, - past_key_values=past_key_values, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - cross_attentions=all_cross_attentions, + past_key_values=past_key_values if use_cache else None, ) @@ -647,10 +542,18 @@ def forward(self, hidden_states): @auto_docstring class RobertaPreTrainedModel(PreTrainedModel): - config: RobertaConfig + config_class = RobertaConfig base_model_prefix = "roberta" supports_gradient_checkpointing = True + _supports_flash_attn = True _supports_sdpa = True + _supports_flex_attn = True + _supports_attention_backend = True + _can_record_outputs = { + "hidden_states": RobertaLayer, + 
"attentions": RobertaSelfAttention, + "cross_attentions": RobertaCrossAttention, + } def _init_weights(self, module): """Initialize the weights""" @@ -691,15 +594,13 @@ def __init__(self, config, add_pooling_layer=True): """ super().__init__(config) self.config = config + self.gradient_checkpointing = False self.embeddings = RobertaEmbeddings(config) self.encoder = RobertaEncoder(config) self.pooler = RobertaPooler(config) if add_pooling_layer else None - self.attn_implementation = config._attn_implementation - self.position_embedding_type = config.position_embedding_type - # Initialize weights and apply final processing self.post_init() @@ -717,6 +618,7 @@ class PreTrainedModel for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) + @check_model_inputs @auto_docstring def forward( self, @@ -728,52 +630,40 @@ def forward( inputs_embeds: Optional[torch.Tensor] = None, encoder_hidden_states: Optional[torch.Tensor] = None, encoder_attention_mask: Optional[torch.Tensor] = None, - past_key_values: Optional[list[torch.FloatTensor]] = None, + past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None, use_cache: Optional[bool] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, cache_position: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if self.config.is_decoder: use_cache = use_cache if use_cache is not None else self.config.use_cache else: use_cache = False - if input_ids is not None and inputs_embeds is not None: - raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") - elif input_ids is not None: - self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) - input_shape = input_ids.size() - elif inputs_embeds is not None: - input_shape = inputs_embeds.size()[:-1] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") - - batch_size, seq_length = input_shape - device = input_ids.device if input_ids is not None else inputs_embeds.device - - past_key_values_length = 0 - if past_key_values is not None: - past_key_values_length = ( - past_key_values[0][0].shape[-2] - if not isinstance(past_key_values, Cache) - else past_key_values.get_seq_length() + return_legacy_cache = False + if use_cache and not isinstance(past_key_values, Cache): + logger.warning_once( + "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. " + "You should pass an instance of `EncoderDecoderCache` instead, e.g. " + "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`." 
) + return_legacy_cache = True + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) - if token_type_ids is None: - if hasattr(self.embeddings, "token_type_ids"): - buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] - buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) - token_type_ids = buffered_token_type_ids_expanded - else: - token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if input_ids is not None: + device = input_ids.device + input_shape = input_ids.shape + else: + device = inputs_embeds.device + input_shape = inputs_embeds.shape[:-1] + + seq_length = input_shape[1] + past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0 + if cache_position is None: + cache_position = torch.arange(past_key_values_length, past_key_values_length + seq_length, device=device) embedding_output = self.embeddings( input_ids=input_ids, @@ -783,55 +673,16 @@ def forward( past_key_values_length=past_key_values_length, ) - if attention_mask is None: - attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device) - - use_sdpa_attention_masks = ( - self.attn_implementation == "sdpa" - and self.position_embedding_type == "absolute" - and head_mask is None - and not output_attentions + attention_mask, encoder_attention_mask = self._create_attention_masks( + input_shape=input_shape, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + embedding_output=embedding_output, + encoder_hidden_states=encoder_hidden_states, + cache_position=cache_position, + past_key_values=past_key_values, ) - # Expand the attention mask - if use_sdpa_attention_masks and attention_mask.dim() == 2: - # Expand the attention mask for SDPA. - # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len] - if self.config.is_decoder: - extended_attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( - attention_mask, - input_shape, - embedding_output, - past_key_values_length, - ) - else: - extended_attention_mask = _prepare_4d_attention_mask_for_sdpa( - attention_mask, embedding_output.dtype, tgt_len=seq_length - ) - else: - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. - extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) - - # If a 2D or 3D attention mask is provided for the cross-attention - # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.config.is_decoder and encoder_hidden_states is not None: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() - encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) - if encoder_attention_mask is None: - encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) - - if use_sdpa_attention_masks and encoder_attention_mask.dim() == 2: - # Expand the attention mask for SDPA. 
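# Illustrative sketch of the legacy-cache shim in the forward above, assuming the public
# `transformers.EncoderDecoderCache` API referenced in the hunk: a tuple-per-layer cache of
# (self_k, self_v, cross_k, cross_v) tensors is converted on the way in and converted back on
# the way out for callers still passing tuples.
import torch
from transformers import EncoderDecoderCache

legacy = tuple(
    (torch.randn(1, 4, 3, 8), torch.randn(1, 4, 3, 8),    # self-attention k/v
     torch.randn(1, 4, 7, 8), torch.randn(1, 4, 7, 8))    # cross-attention k/v
    for _ in range(2)                                      # two decoder layers
)
cache = EncoderDecoderCache.from_legacy_cache(legacy)
print(cache.get_seq_length())                              # 3 (self-attention length)
legacy_again = cache.to_legacy_cache()
print(len(legacy_again), len(legacy_again[0]))             # 2 layers, 4 tensors each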
- # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len] - encoder_extended_attention_mask = _prepare_4d_attention_mask_for_sdpa( - encoder_attention_mask, embedding_output.dtype, tgt_len=seq_length - ) - else: - encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) - else: - encoder_extended_attention_mask = None - # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N @@ -841,28 +692,131 @@ def forward( encoder_outputs = self.encoder( embedding_output, - attention_mask=extended_attention_mask, + attention_mask=attention_mask, head_mask=head_mask, encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask, + encoder_attention_mask=encoder_attention_mask, past_key_values=past_key_values, use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, cache_position=cache_position, + position_ids=position_ids, + **kwargs, ) - sequence_output = encoder_outputs[0] + sequence_output = encoder_outputs.last_hidden_state pooled_output = self.pooler(sequence_output) if self.pooler is not None else None - if not return_dict: - return (sequence_output, pooled_output) + encoder_outputs[1:] + if return_legacy_cache: + encoder_outputs.past_key_values = encoder_outputs.past_key_values.to_legacy_cache() return BaseModelOutputWithPoolingAndCrossAttentions( last_hidden_state=sequence_output, pooler_output=pooled_output, past_key_values=encoder_outputs.past_key_values, - hidden_states=encoder_outputs.hidden_states, - attentions=encoder_outputs.attentions, - cross_attentions=encoder_outputs.cross_attentions, ) + + def _create_attention_masks( + self, + input_shape, + attention_mask, + encoder_attention_mask, + embedding_output, + encoder_hidden_states, + cache_position, + past_key_values, + ): + if attention_mask is not None and attention_mask.dim() == 2: + if self.config.is_decoder: + attention_mask = create_causal_mask( + config=self.config, + input_embeds=embedding_output, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + ) + else: + attention_mask = self._update_full_mask( + attention_mask, + embedding_output, + ) + elif attention_mask is not None and attention_mask.dim() == 3: + if "flash" in self.config._attn_implementation or self.config._attn_implementation == "flex_attention": + raise ValueError( + "Passing attention mask with a 3D/4D shape does not work with type " + f"{self.config._attn_implementation} - please use either `sdpa` or `eager` instead." + ) + attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + + if encoder_attention_mask is not None: + if encoder_attention_mask.dim() == 2: + encoder_attention_mask = self._update_cross_attn_mask( + encoder_hidden_states, + encoder_attention_mask, + embedding_output.shape[:2], + embedding_output, + ) + else: + if "flash" in self.config._attn_implementation or self.config._attn_implementation == "flex_attention": + raise ValueError( + "Passing attention mask with a 3D/4D shape does not work with type " + f"{self.config._attn_implementation} - please use either `sdpa` or `eager` instead." 
+ ) + encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask) + + return attention_mask, encoder_attention_mask + + def _update_full_mask( + self, + attention_mask: Union[torch.Tensor, None], + inputs_embeds: torch.Tensor, + ): + if attention_mask is not None: + if "flash" in self.config._attn_implementation: + attention_mask = attention_mask if 0 in attention_mask else None + elif self.config._attn_implementation == "sdpa": + # output_attentions=True & head_mask can not be supported when using SDPA, fall back to + # the manual implementation that requires a 4D causal mask in all cases. + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype) + elif self.config._attn_implementation == "flex_attention": + if isinstance(attention_mask, torch.Tensor): + attention_mask = make_flex_block_causal_mask(attention_mask, is_causal=False) + else: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype) + + return attention_mask + + def _update_cross_attn_mask( + self, + encoder_hidden_states: Union[torch.Tensor, None], + encoder_attention_mask: Union[torch.Tensor, None], + input_shape: torch.Size, + inputs_embeds: torch.Tensor, + ): + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + if "flash" in self.config._attn_implementation: + encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None + elif self.config._attn_implementation == "sdpa": + # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. 
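# Illustrative note on the flash branch of `_update_full_mask` above: flash kernels take no
# additive 4D mask, so the helper keeps the raw 2D mask only when it actually contains
# padding (a zero somewhere) and drops it entirely otherwise.
import torch

def flash_style_mask(attention_mask):
    return attention_mask if 0 in attention_mask else None

print(flash_style_mask(torch.tensor([[1, 1, 1]])))  # None: nothing is padded
print(flash_style_mask(torch.tensor([[1, 1, 0]])))  # tensor([[1, 1, 0]]) passed through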
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa( + encoder_attention_mask, + inputs_embeds.dtype, + tgt_len=input_shape[-1], + ) + elif self.config._attn_implementation == "flex_attention": + if isinstance(encoder_attention_mask, torch.Tensor): + encoder_attention_mask = make_flex_block_causal_mask( + encoder_attention_mask, + query_length=input_shape[-1], + is_causal=False, + ) + else: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask( + encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + return encoder_attention_mask diff --git a/examples/modular-transformers/modular_dummy_bert.py b/examples/modular-transformers/modular_dummy_bert.py index fb7440228d8c..f3205d2af03e 100644 --- a/examples/modular-transformers/modular_dummy_bert.py +++ b/examples/modular-transformers/modular_dummy_bert.py @@ -5,6 +5,8 @@ from transformers.models.bert.modeling_bert import BertModel from ...modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions +from ...processing_utils import Unpack +from ...utils import TransformersKwargs class DummyBertModel(BertModel): @@ -24,5 +26,6 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, cache_position: Optional[torch.Tensor] = None, + **kwargs: Unpack[TransformersKwargs], ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: - return super().forward(input_ids) + return super().forward(input_ids, **kwargs) diff --git a/src/transformers/models/albert/configuration_albert.py b/src/transformers/models/albert/configuration_albert.py index b60c19d504f0..110ad8cb138d 100644 --- a/src/transformers/models/albert/configuration_albert.py +++ b/src/transformers/models/albert/configuration_albert.py @@ -68,12 +68,6 @@ class AlbertConfig(PretrainedConfig): The epsilon used by the layer normalization layers. classifier_dropout_prob (`float`, *optional*, defaults to 0.1): The dropout ratio for attached classifiers. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For - positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). - For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). pad_token_id (`int`, *optional*, defaults to 0): Padding token id. 
bos_token_id (`int`, *optional*, defaults to 2): @@ -123,7 +117,6 @@ def __init__( initializer_range=0.02, layer_norm_eps=1e-12, classifier_dropout_prob=0.1, - position_embedding_type="absolute", pad_token_id=0, bos_token_id=2, eos_token_id=3, @@ -147,7 +140,6 @@ def __init__( self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.classifier_dropout_prob = classifier_dropout_prob - self.position_embedding_type = position_embedding_type # Copied from transformers.models.bert.configuration_bert.BertOnnxConfig with Roberta->Albert diff --git a/src/transformers/models/align/configuration_align.py b/src/transformers/models/align/configuration_align.py index b924d85a6ca6..3eaeffdc6f23 100644 --- a/src/transformers/models/align/configuration_align.py +++ b/src/transformers/models/align/configuration_align.py @@ -62,12 +62,6 @@ class AlignTextConfig(PretrainedConfig): The epsilon used by the layer normalization layers. pad_token_id (`int`, *optional*, defaults to 0): Padding token id. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For - positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). - For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if `config.is_decoder=True`. 
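# Behavioural note on the config cleanups (illustrative, not from the patch): these configs
# inherit PretrainedConfig, which stores unrecognised kwargs as plain attributes, so older
# checkpoints whose config.json still carries "position_embedding_type" keep loading; the
# value is simply no longer read by the modeling code.
from transformers import AlbertConfig

cfg = AlbertConfig(hidden_size=128, num_attention_heads=4, position_embedding_type="absolute")
print(getattr(cfg, "position_embedding_type", None))  # "absolute": stored but unused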
@@ -105,7 +99,6 @@ def __init__( initializer_range=0.02, layer_norm_eps=1e-12, pad_token_id=0, - position_embedding_type="absolute", use_cache=True, **kwargs, ): @@ -123,7 +116,6 @@ def __init__( self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps - self.position_embedding_type = position_embedding_type self.use_cache = use_cache self.pad_token_id = pad_token_id diff --git a/src/transformers/models/align/modeling_align.py b/src/transformers/models/align/modeling_align.py index 839856b92119..040a2b1c9592 100644 --- a/src/transformers/models/align/modeling_align.py +++ b/src/transformers/models/align/modeling_align.py @@ -519,7 +519,6 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) @@ -558,11 +557,11 @@ def forward( if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings diff --git a/src/transformers/models/altclip/configuration_altclip.py b/src/transformers/models/altclip/configuration_altclip.py index 5e8c0f2a262e..43554aee3de3 100755 --- a/src/transformers/models/altclip/configuration_altclip.py +++ b/src/transformers/models/altclip/configuration_altclip.py @@ -67,12 +67,6 @@ class AltCLIPTextConfig(PretrainedConfig): bos_token_id (`int`, *optional*, defaults to 0): The id of the *beginning-of-sequence* token. eos_token_id (`Union[int, list[int]]`, *optional*, defaults to 2): The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For - positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). - For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if `config.is_decoder=True`. 
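# Minimal sketch of the embedding path after the simplification above (absolute position
# embeddings are always added; the position_embedding_type switch is gone). Module sizes are
# arbitrary toy values, not taken from any config.
import torch
import torch.nn as nn

hidden = 32
word_emb = nn.Embedding(100, hidden)
pos_emb = nn.Embedding(64, hidden)
type_emb = nn.Embedding(2, hidden)
norm, drop = nn.LayerNorm(hidden), nn.Dropout(0.1)

input_ids = torch.randint(0, 100, (2, 10))
position_ids = torch.arange(10).unsqueeze(0)
token_type_ids = torch.zeros(2, 10, dtype=torch.long)

embeddings = word_emb(input_ids) + type_emb(token_type_ids)
embeddings = embeddings + pos_emb(position_ids)   # unconditionally applied now
embeddings = drop(norm(embeddings))
print(embeddings.shape)                           # torch.Size([2, 10, 32])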
@@ -114,7 +108,6 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - position_embedding_type="absolute", use_cache=True, project_dim=768, **kwargs, @@ -134,7 +127,6 @@ def __init__( self.initializer_range = initializer_range self.initializer_factor = initializer_factor self.layer_norm_eps = layer_norm_eps - self.position_embedding_type = position_embedding_type self.use_cache = use_cache self.project_dim = project_dim diff --git a/src/transformers/models/bert/configuration_bert.py b/src/transformers/models/bert/configuration_bert.py index e7e51d3295ef..28367fbb8ff8 100644 --- a/src/transformers/models/bert/configuration_bert.py +++ b/src/transformers/models/bert/configuration_bert.py @@ -65,12 +65,6 @@ class BertConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For - positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). - For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). is_decoder (`bool`, *optional*, defaults to `False`): Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. use_cache (`bool`, *optional*, defaults to `True`): @@ -111,7 +105,6 @@ def __init__( initializer_range=0.02, layer_norm_eps=1e-12, pad_token_id=0, - position_embedding_type="absolute", use_cache=True, classifier_dropout=None, **kwargs, @@ -130,7 +123,6 @@ def __init__( self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps - self.position_embedding_type = position_embedding_type self.use_cache = use_cache self.classifier_dropout = classifier_dropout diff --git a/src/transformers/models/bert_generation/configuration_bert_generation.py b/src/transformers/models/bert_generation/configuration_bert_generation.py index e6cf054cc5e2..b604378418a6 100644 --- a/src/transformers/models/bert_generation/configuration_bert_generation.py +++ b/src/transformers/models/bert_generation/configuration_bert_generation.py @@ -60,12 +60,6 @@ class BertGenerationConfig(PretrainedConfig): Beginning of stream token id. eos_token_id (`int`, *optional*, defaults to 1): End of stream token id. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For - positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). - For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). 
Only relevant if `config.is_decoder=True`. @@ -103,7 +97,6 @@ def __init__( pad_token_id=0, bos_token_id=2, eos_token_id=1, - position_embedding_type="absolute", use_cache=True, **kwargs, ): @@ -120,7 +113,6 @@ def __init__( self.max_position_embeddings = max_position_embeddings self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps - self.position_embedding_type = position_embedding_type self.use_cache = use_cache diff --git a/src/transformers/models/blip/modeling_blip_text.py b/src/transformers/models/blip/modeling_blip_text.py index 99026a2b4fd0..0c553d990cdb 100644 --- a/src/transformers/models/blip/modeling_blip_text.py +++ b/src/transformers/models/blip/modeling_blip_text.py @@ -56,7 +56,6 @@ def __init__(self, config): self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.config = config @@ -82,9 +81,9 @@ def forward( embeddings = inputs_embeds - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings @@ -115,10 +114,6 @@ def __init__(self, config, is_cross_attention, layer_idx=None): self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) def save_attn_gradients(self, attn_gradients): self.attn_gradients = attn_gradients @@ -199,22 +194,6 @@ def forward( # Take the dot product between "query" and "key" to get the raw attention scores. 
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - seq_length = hidden_states.size()[1] - position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) - position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) - distance = position_ids_l - position_ids_r - positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility - - if self.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores - elif self.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in BlipTextModel forward() function) diff --git a/src/transformers/models/blip_2/configuration_blip_2.py b/src/transformers/models/blip_2/configuration_blip_2.py index 23145ffc543f..f95aa409e60c 100644 --- a/src/transformers/models/blip_2/configuration_blip_2.py +++ b/src/transformers/models/blip_2/configuration_blip_2.py @@ -146,12 +146,6 @@ class Blip2QFormerConfig(PretrainedConfig): The epsilon used by the layer normalization layers. pad_token_id (`int`, *optional*, defaults to 0): Index to be used for padding token. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For - positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). - For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). cross_attention_frequency (`int`, *optional*, defaults to 2): The frequency of adding cross-attention to the Transformer layers. 
encoder_hidden_size (`int`, *optional*, defaults to 1408): @@ -190,7 +184,6 @@ def __init__( initializer_range=0.02, layer_norm_eps=1e-12, pad_token_id=0, - position_embedding_type="absolute", cross_attention_frequency=2, encoder_hidden_size=1408, use_qformer_text_input=False, @@ -209,7 +202,6 @@ def __init__( self.max_position_embeddings = max_position_embeddings self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps - self.position_embedding_type = position_embedding_type self.cross_attention_frequency = cross_attention_frequency self.encoder_hidden_size = encoder_hidden_size self.use_qformer_text_input = use_qformer_text_input diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index b552df47f2fc..de0de6ace68d 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -551,10 +551,6 @@ def __init__(self, config, is_cross_attention=False): self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.save_attention = False def save_attn_gradients(self, attn_gradients): @@ -603,22 +599,6 @@ def forward( # Take the dot product between "query" and "key" to get the raw attention scores. attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - seq_length = hidden_states.size()[1] - position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) - position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) - distance = position_ids_l - position_ids_r - positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility - - if self.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores - elif self.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: @@ -884,7 +864,6 @@ def __init__(self, config): self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") def forward( self, @@ -903,9 +882,9 @@ def forward( if input_ids is not None: input_ids = input_ids.to(self.word_embeddings.weight.device) embeddings = self.word_embeddings(input_ids) - if self.position_embedding_type == "absolute": - position_embeddings = 
self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings if query_embeds is not None: # `query_embeds` are kept in fp32 when we use it with Qformer diff --git a/src/transformers/models/bridgetower/configuration_bridgetower.py b/src/transformers/models/bridgetower/configuration_bridgetower.py index 4c84b0a294da..fc363be9d57a 100644 --- a/src/transformers/models/bridgetower/configuration_bridgetower.py +++ b/src/transformers/models/bridgetower/configuration_bridgetower.py @@ -133,12 +133,6 @@ class BridgeTowerTextConfig(PretrainedConfig): testing). layer_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For - positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). - For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). is_decoder (`bool`, *optional*, defaults to `False`): Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. use_cache (`bool`, *optional*, defaults to `True`): @@ -177,7 +171,6 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - position_embedding_type="absolute", use_cache=True, **kwargs, ): @@ -195,7 +188,6 @@ def __init__( self.max_position_embeddings = max_position_embeddings self.type_vocab_size = type_vocab_size self.layer_norm_eps = layer_norm_eps - self.position_embedding_type = position_embedding_type self.use_cache = use_cache self.pad_token_id = pad_token_id self.bos_token_id = bos_token_id diff --git a/src/transformers/models/bros/modeling_bros.py b/src/transformers/models/bros/modeling_bros.py index 5f5dd05ff82d..fc3a50e8320b 100755 --- a/src/transformers/models/bros/modeling_bros.py +++ b/src/transformers/models/bros/modeling_bros.py @@ -129,7 +129,6 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) self.register_buffer( "token_type_ids", @@ -169,11 +168,11 @@ def forward( if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings @@ -197,10 +196,6 @@ def __init__(self, config): self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - 
self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_decoder = config.is_decoder @@ -233,23 +228,6 @@ def forward( # Take the dot product between "query" and "key" to get the raw attention scores. attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - seq_length = hidden_states.size()[1] - position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) - position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) - distance = position_ids_l - position_ids_r - positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility - - if self.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores - elif self.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) - - attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - # bbox positional encoding batch_size, n_head, seq_length, d_head = query_layer.shape bbox_pos_emb = bbox_pos_emb.view(seq_length, seq_length, batch_size, d_head) diff --git a/src/transformers/models/camembert/configuration_camembert.py b/src/transformers/models/camembert/configuration_camembert.py index 3979e5487443..9f3b71da0904 100644 --- a/src/transformers/models/camembert/configuration_camembert.py +++ b/src/transformers/models/camembert/configuration_camembert.py @@ -65,12 +65,6 @@ class CamembertConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For - positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). - For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). is_decoder (`bool`, *optional*, defaults to `False`): Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. 
use_cache (`bool`, *optional*, defaults to `True`): @@ -113,7 +107,6 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - position_embedding_type="absolute", use_cache=True, classifier_dropout=None, **kwargs, @@ -132,7 +125,6 @@ def __init__( self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps - self.position_embedding_type = position_embedding_type self.use_cache = use_cache self.classifier_dropout = classifier_dropout diff --git a/src/transformers/models/canine/modeling_canine.py b/src/transformers/models/canine/modeling_canine.py index 545919dc7b77..e4837f5a22ec 100644 --- a/src/transformers/models/canine/modeling_canine.py +++ b/src/transformers/models/canine/modeling_canine.py @@ -106,7 +106,6 @@ def __init__(self, config): self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") def _hash_bucket_tensors(self, input_ids, num_hashes: int, num_buckets: int): """ @@ -171,12 +170,11 @@ def forward( ) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.char_position_embeddings(position_ids) - embeddings += position_embeddings + position_embeddings = self.char_position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings @@ -300,10 +298,6 @@ def __init__(self, config): self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) def forward( self, @@ -338,22 +332,6 @@ def forward( # Take the dot product between "query" and "key" to get the raw attention scores. 
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - seq_length = from_tensor.size()[1] - position_ids_l = torch.arange(seq_length, dtype=torch.long, device=from_tensor.device).view(-1, 1) - position_ids_r = torch.arange(seq_length, dtype=torch.long, device=from_tensor.device).view(1, -1) - distance = position_ids_l - position_ids_r - positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility - - if self.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores - elif self.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: if attention_mask.ndim == 3: diff --git a/src/transformers/models/chinese_clip/configuration_chinese_clip.py b/src/transformers/models/chinese_clip/configuration_chinese_clip.py index 5b9c31965585..11a28bac6034 100644 --- a/src/transformers/models/chinese_clip/configuration_chinese_clip.py +++ b/src/transformers/models/chinese_clip/configuration_chinese_clip.py @@ -75,12 +75,6 @@ class ChineseCLIPTextConfig(PretrainedConfig): The epsilon used by the layer normalization layers. pad_token_id (`int`, *optional*, defaults to 0): Padding token id. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For - positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). - For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if `config.is_decoder=True`. 
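# A minimal, illustrative sketch (not taken from the patched files; class and
# parameter names here are hypothetical) of what the BERT-style text embedding
# forward pass reduces to once the `position_embedding_type` branch is removed:
# absolute position embeddings are always added, then LayerNorm and dropout.
import torch
from torch import nn


class AbsoluteTextEmbeddings(nn.Module):
    def __init__(self, vocab_size=30522, hidden_size=768, max_position_embeddings=512,
                 type_vocab_size=2, layer_norm_eps=1e-12, dropout=0.1, pad_token_id=0):
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, hidden_size, padding_idx=pad_token_id)
        self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
        self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size)
        self.LayerNorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, token_type_ids=None, position_ids=None):
        seq_length = input_ids.size(1)
        if position_ids is None:
            position_ids = torch.arange(seq_length, device=input_ids.device).unsqueeze(0)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)
        embeddings = self.word_embeddings(input_ids) + self.token_type_embeddings(token_type_ids)
        # Absolute position embeddings are added unconditionally now.
        embeddings = embeddings + self.position_embeddings(position_ids)
        return self.dropout(self.LayerNorm(embeddings))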
@@ -119,7 +113,6 @@ def __init__( initializer_factor=1.0, layer_norm_eps=1e-12, pad_token_id=0, - position_embedding_type="absolute", use_cache=True, **kwargs, ): @@ -138,7 +131,6 @@ def __init__( self.initializer_range = initializer_range self.initializer_factor = initializer_factor self.layer_norm_eps = layer_norm_eps - self.position_embedding_type = position_embedding_type self.use_cache = use_cache diff --git a/src/transformers/models/chinese_clip/modeling_chinese_clip.py b/src/transformers/models/chinese_clip/modeling_chinese_clip.py index a689886abc37..5764981c065d 100644 --- a/src/transformers/models/chinese_clip/modeling_chinese_clip.py +++ b/src/transformers/models/chinese_clip/modeling_chinese_clip.py @@ -101,7 +101,6 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) @@ -140,11 +139,11 @@ def forward( if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 900e8d373f5a..7e4ee719bc9a 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -58,12 +58,6 @@ class ClapTextConfig(PretrainedConfig): The vocabulary size of the `token_type_ids` passed when calling [`ClapTextModel`]. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For - positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). - For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). is_decoder (`bool`, *optional*, defaults to `False`): Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. 
use_cache (`bool`, *optional*, defaults to `True`): @@ -111,7 +105,6 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - position_embedding_type="absolute", use_cache=True, projection_hidden_act="relu", **kwargs, @@ -130,7 +123,6 @@ def __init__( self.type_vocab_size = type_vocab_size self.initializer_factor = initializer_factor self.layer_norm_eps = layer_norm_eps - self.position_embedding_type = position_embedding_type self.use_cache = use_cache self.projection_hidden_act = projection_hidden_act self.projection_dim = projection_dim diff --git a/src/transformers/models/data2vec/configuration_data2vec_text.py b/src/transformers/models/data2vec/configuration_data2vec_text.py index f9518d67bf66..8ab16458c5d8 100644 --- a/src/transformers/models/data2vec/configuration_data2vec_text.py +++ b/src/transformers/models/data2vec/configuration_data2vec_text.py @@ -64,12 +64,6 @@ class Data2VecTextConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For - positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). - For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). is_decoder (`bool`, *optional*, defaults to `False`): Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. 
use_cache (`bool`, *optional*, defaults to `True`): @@ -112,7 +106,6 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - position_embedding_type="absolute", use_cache=True, classifier_dropout=None, **kwargs, @@ -131,7 +124,6 @@ def __init__( self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps - self.position_embedding_type = position_embedding_type self.use_cache = use_cache self.classifier_dropout = classifier_dropout diff --git a/src/transformers/models/deprecated/ernie_m/modeling_ernie_m.py b/src/transformers/models/deprecated/ernie_m/modeling_ernie_m.py index 4cecdf5728a3..2f960bc52b83 100755 --- a/src/transformers/models/deprecated/ernie_m/modeling_ernie_m.py +++ b/src/transformers/models/deprecated/ernie_m/modeling_ernie_m.py @@ -88,7 +88,7 @@ def forward( class ErnieMSelfAttention(nn.Module): - def __init__(self, config, position_embedding_type=None): + def __init__(self, config): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -105,12 +105,6 @@ def __init__(self, config, position_embedding_type=None): self.v_proj = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_decoder = config.is_decoder @@ -171,28 +165,6 @@ def forward( # Take the dot product between "query" and "key" to get the raw attention scores. 
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - query_length, key_length = query_layer.shape[2], key_layer.shape[2] - if use_cache: - position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( - -1, 1 - ) - else: - position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) - position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) - distance = position_ids_l - position_ids_r - - positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility - - if self.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores - elif self.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in ErnieMModel forward() function) @@ -223,9 +195,9 @@ def forward( class ErnieMAttention(nn.Module): - def __init__(self, config, position_embedding_type=None): + def __init__(self, config): super().__init__() - self.self_attn = ErnieMSelfAttention(config, position_embedding_type=position_embedding_type) + self.self_attn = ErnieMSelfAttention(config) self.out_proj = nn.Linear(config.hidden_size, config.hidden_size) self.pruned_heads = set() diff --git a/src/transformers/models/deprecated/qdqbert/modeling_qdqbert.py b/src/transformers/models/deprecated/qdqbert/modeling_qdqbert.py index f522a1d72154..b924d76eb6fc 100755 --- a/src/transformers/models/deprecated/qdqbert/modeling_qdqbert.py +++ b/src/transformers/models/deprecated/qdqbert/modeling_qdqbert.py @@ -82,7 +82,6 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) @@ -122,11 +121,11 @@ def forward( if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings @@ -150,10 +149,6 @@ def __init__(self, config): self.value = quant_nn.QuantLinear(config.hidden_size, self.all_head_size) self.dropout = 
nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_decoder = config.is_decoder @@ -220,22 +215,6 @@ def forward( self.matmul_q_input_quantizer(query_layer), self.matmul_k_input_quantizer(key_layer.transpose(-1, -2)) ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - seq_length = hidden_states.size()[1] - position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) - position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) - distance = position_ids_l - position_ids_r - positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility - - if self.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores - elif self.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in QDQBertModel forward() function) diff --git a/src/transformers/models/deprecated/realm/modeling_realm.py b/src/transformers/models/deprecated/realm/modeling_realm.py index 69bab60f6803..4fa585dcc1a9 100644 --- a/src/transformers/models/deprecated/realm/modeling_realm.py +++ b/src/transformers/models/deprecated/realm/modeling_realm.py @@ -57,7 +57,6 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) @@ -97,18 +96,18 @@ def forward( if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings class RealmSelfAttention(nn.Module): - def __init__(self, config, position_embedding_type=None): + def __init__(self, config): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): 
raise ValueError( @@ -125,12 +124,6 @@ def __init__(self, config, position_embedding_type=None): self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_decoder = config.is_decoder @@ -191,28 +184,6 @@ def forward( # Take the dot product between "query" and "key" to get the raw attention scores. attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - query_length, key_length = query_layer.shape[2], key_layer.shape[2] - if use_cache: - position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( - -1, 1 - ) - else: - position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) - position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) - distance = position_ids_l - position_ids_r - - positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility - - if self.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores - elif self.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in RealmModel forward() function) @@ -262,11 +233,9 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to class RealmAttention(nn.Module): - def __init__(self, config, position_embedding_type=None): + def __init__(self, config): super().__init__() - self.self = REALM_SELF_ATTENTION_CLASSES[config._attn_implementation]( - config, position_embedding_type=position_embedding_type - ) + self.self = REALM_SELF_ATTENTION_CLASSES[config._attn_implementation](config) self.output = RealmSelfOutput(config) self.pruned_heads = set() @@ -353,7 +322,7 @@ def __init__(self, config): if self.add_cross_attention: if not self.is_decoder: raise ValueError(f"{self} should be used as a decoder model if cross attention is added") - self.crossattention = RealmAttention(config, position_embedding_type="absolute") + self.crossattention = RealmAttention(config) self.intermediate = RealmIntermediate(config) self.output = RealmOutput(config) diff --git a/src/transformers/models/dpr/configuration_dpr.py b/src/transformers/models/dpr/configuration_dpr.py index 03b169002493..4b310b673f63 100644 --- a/src/transformers/models/dpr/configuration_dpr.py +++ 
b/src/transformers/models/dpr/configuration_dpr.py @@ -64,12 +64,6 @@ class DPRConfig(PretrainedConfig): The epsilon used by the layer normalization layers. pad_token_id (`int`, *optional*, defaults to 0): Padding token id. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For - positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). - For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). projection_dim (`int`, *optional*, defaults to 0): Dimension of the projection for the context and question encoders. If it is set to zero (default), then no projection is done. @@ -106,7 +100,6 @@ def __init__( initializer_range=0.02, layer_norm_eps=1e-12, pad_token_id=0, - position_embedding_type="absolute", projection_dim: int = 0, **kwargs, ): @@ -125,7 +118,6 @@ def __init__( self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.projection_dim = projection_dim - self.position_embedding_type = position_embedding_type __all__ = ["DPRConfig"] diff --git a/src/transformers/models/electra/configuration_electra.py b/src/transformers/models/electra/configuration_electra.py index f12756d976b3..481925519c07 100644 --- a/src/transformers/models/electra/configuration_electra.py +++ b/src/transformers/models/electra/configuration_electra.py @@ -89,12 +89,6 @@ class ElectraConfig(PretrainedConfig): Argument used when doing sequence summary. Used in the sequence classification and multiple choice models. The dropout ratio to be used after the projection and activation. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For - positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). - For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if `config.is_decoder=True`. 
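# A minimal, illustrative sketch (not taken from the patched files; the function
# name and signature are hypothetical) of the eager attention path that remains
# once the relative_key / relative_key_query branches are dropped: a plain
# scaled dot-product over [batch, heads, seq, head_dim] tensors with an
# optional additive mask, softmax, and dropout.
import math

import torch
from torch import nn


def plain_eager_attention(query, key, value, attention_mask=None, dropout_p=0.0, training=False):
    head_dim = query.size(-1)
    # Raw attention scores: dot product of queries and keys, scaled by sqrt(d_head).
    scores = torch.matmul(query, key.transpose(-1, -2)) / math.sqrt(head_dim)
    if attention_mask is not None:
        # Additive mask, e.g. large negative values on padded positions.
        scores = scores + attention_mask
    probs = nn.functional.softmax(scores, dim=-1)
    probs = nn.functional.dropout(probs, p=dropout_p, training=training)
    return torch.matmul(probs, value), probs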
@@ -138,7 +132,6 @@ def __init__( summary_activation="gelu", summary_last_dropout=0.1, pad_token_id=0, - position_embedding_type="absolute", use_cache=True, classifier_dropout=None, **kwargs, @@ -163,7 +156,6 @@ def __init__( self.summary_use_proj = summary_use_proj self.summary_activation = summary_activation self.summary_last_dropout = summary_last_dropout - self.position_embedding_type = position_embedding_type self.use_cache = use_cache self.classifier_dropout = classifier_dropout diff --git a/src/transformers/models/ernie/configuration_ernie.py b/src/transformers/models/ernie/configuration_ernie.py index abf300f0ce51..e7f5bd1aff1a 100644 --- a/src/transformers/models/ernie/configuration_ernie.py +++ b/src/transformers/models/ernie/configuration_ernie.py @@ -71,12 +71,6 @@ class ErnieConfig(PretrainedConfig): The epsilon used by the layer normalization layers. pad_token_id (`int`, *optional*, defaults to 0): Padding token id. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For - positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). - For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if `config.is_decoder=True`. @@ -117,7 +111,6 @@ def __init__( initializer_range=0.02, layer_norm_eps=1e-12, pad_token_id=0, - position_embedding_type="absolute", use_cache=True, classifier_dropout=None, **kwargs, @@ -138,7 +131,6 @@ def __init__( self.use_task_id = use_task_id self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps - self.position_embedding_type = position_embedding_type self.use_cache = use_cache self.classifier_dropout = classifier_dropout diff --git a/src/transformers/models/esm/configuration_esm.py b/src/transformers/models/esm/configuration_esm.py index fabfb4ebd6d3..afd5ee255ad0 100644 --- a/src/transformers/models/esm/configuration_esm.py +++ b/src/transformers/models/esm/configuration_esm.py @@ -67,11 +67,7 @@ class EsmConfig(PretrainedConfig): layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query", "rotary"`. - For positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). - For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). + Type of position embedding. Choose either `"absolute"` or "rotary"`. is_decoder (`bool`, *optional*, defaults to `False`): Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. 
use_cache (`bool`, *optional*, defaults to `True`): diff --git a/src/transformers/models/esm/modeling_esm.py b/src/transformers/models/esm/modeling_esm.py index 3524b221a0ec..973e17c6ade6 100755 --- a/src/transformers/models/esm/modeling_esm.py +++ b/src/transformers/models/esm/modeling_esm.py @@ -251,45 +251,29 @@ def create_position_ids_from_inputs_embeds(self, inputs_embeds): return position_ids.unsqueeze(0).expand(input_shape) +# Copied from transformers.models.bert.modeling_bert.eager_attention_forward def eager_attention_forward( module: nn.Module, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, attention_mask: Optional[torch.Tensor], - scaling: float, + scaling: Optional[float] = None, dropout: float = 0.0, head_mask: Optional[torch.Tensor] = None, **kwargs: Unpack[TransformersKwargs], ): - # ESM applies relative position embeddings and we don't copy from Llama + if scaling is None: + scaling = query.size(-1) ** -0.5 + + # Take the dot product between "query" and "key" to get the raw attention scores. attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling - if hasattr(module, "position_embedding_type") and module.position_embedding_type in [ - "relative_key", - "relative_key_query", - ]: - seq_length = query.shape[2] - position_ids_l = torch.arange(seq_length, dtype=torch.long, device=attn_weights.device).view(-1, 1) - position_ids_r = torch.arange(seq_length, dtype=torch.long, device=attn_weights.device).view(1, -1) - distance = position_ids_l - position_ids_r - positional_embedding = module.distance_embedding(distance + module.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query.dtype) # fp16 compatibility - - if module.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) - elif module.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key, positional_embedding) - relative_position_scores = relative_position_scores_query + relative_position_scores_key - - attn_weights = attn_weights + relative_position_scores - - if attention_mask is not None: - causal_mask = attention_mask[:, :, :, : key.shape[-2]] - attn_weights = attn_weights + causal_mask - - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + if attention_mask is not None and attention_mask.ndim == 4: + attention_mask = attention_mask[:, :, :, : key.shape[-2]] + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) if head_mask is not None: @@ -321,14 +305,12 @@ def __init__(self, config, position_embedding_type=None, layer_idx=None, is_cros self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = config.attention_probs_dropout_prob + + self.rotary_embeddings = None self.position_embedding_type = position_embedding_type or getattr( config, "position_embedding_type", "absolute" ) - self.rotary_embeddings = None - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) - elif self.position_embedding_type == 
"rotary": + if self.position_embedding_type == "rotary": self.rotary_embeddings = RotaryEmbedding(dim=self.attention_head_size) self.scaling = 1.0 # For BC we apply scaling before RoPE @@ -367,11 +349,6 @@ def forward( attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.position_embedding_type in ["relative_key", "relative_key_query"]: - raise ValueError( - f"ESM {self.config._attn_implementation} attention does not support {self.position_embedding_type} embeddings. " - "Set attention explicitly to 'eager' with `model.set_attn_implementation('eager')`" - ) attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( diff --git a/src/transformers/models/evolla/modeling_evolla.py b/src/transformers/models/evolla/modeling_evolla.py index 8bb5713d1764..c3fb683b7a95 100644 --- a/src/transformers/models/evolla/modeling_evolla.py +++ b/src/transformers/models/evolla/modeling_evolla.py @@ -225,39 +225,22 @@ def eager_attention_forward( key: torch.Tensor, value: torch.Tensor, attention_mask: Optional[torch.Tensor], - scaling: float, + scaling: Optional[float] = None, dropout: float = 0.0, head_mask: Optional[torch.Tensor] = None, **kwargs: Unpack[TransformersKwargs], ): - # EVOLLA_SA_PROT applies relative position embeddings and we don't copy from Llama + if scaling is None: + scaling = query.size(-1) ** -0.5 + + # Take the dot product between "query" and "key" to get the raw attention scores. attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling - if hasattr(module, "position_embedding_type") and module.position_embedding_type in [ - "relative_key", - "relative_key_query", - ]: - seq_length = query.shape[2] - position_ids_l = torch.arange(seq_length, dtype=torch.long, device=attn_weights.device).view(-1, 1) - position_ids_r = torch.arange(seq_length, dtype=torch.long, device=attn_weights.device).view(1, -1) - distance = position_ids_l - position_ids_r - positional_embedding = module.distance_embedding(distance + module.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query.dtype) # fp16 compatibility - - if module.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) - elif module.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key, positional_embedding) - relative_position_scores = relative_position_scores_query + relative_position_scores_key - - attn_weights = attn_weights + relative_position_scores - - if attention_mask is not None: - causal_mask = attention_mask[:, :, :, : key.shape[-2]] - attn_weights = attn_weights + causal_mask - - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + if attention_mask is not None and attention_mask.ndim == 4: + attention_mask = attention_mask[:, :, :, : key.shape[-2]] + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) if head_mask is not None: @@ -288,15 +271,11 @@ def __init__(self, config, position_embedding_type=None, layer_idx=None, is_cros self.key = nn.Linear(config.hidden_size, self.all_head_size) self.value = nn.Linear(config.hidden_size, 
self.all_head_size) - self.dropout = config.attention_probs_dropout_prob + self.rotary_embeddings = None self.position_embedding_type = position_embedding_type or getattr( config, "position_embedding_type", "absolute" ) - self.rotary_embeddings = None - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) - elif self.position_embedding_type == "rotary": + if self.position_embedding_type == "rotary": self.rotary_embeddings = EvollaSaProtRotaryEmbedding(dim=self.attention_head_size) self.is_decoder = config.is_decoder @@ -335,11 +314,6 @@ def forward( attention_interface: Callable = eager_attention_forward if self.config._attn_implementation != "eager": - if self.position_embedding_type in ["relative_key", "relative_key_query"]: - raise ValueError( - f"ESM {self.config._attn_implementation} attention does not support {self.position_embedding_type} embeddings. " - "Set attention explicitly to 'eager' with `model.set_attn_implementation('eager')`" - ) attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( diff --git a/src/transformers/models/evolla/modular_evolla.py b/src/transformers/models/evolla/modular_evolla.py index e2db43a7d787..cdbfacbe5c01 100644 --- a/src/transformers/models/evolla/modular_evolla.py +++ b/src/transformers/models/evolla/modular_evolla.py @@ -144,15 +144,11 @@ def __init__(self, config, position_embedding_type=None, layer_idx=None, is_cros self.key = nn.Linear(config.hidden_size, self.all_head_size) self.value = nn.Linear(config.hidden_size, self.all_head_size) - self.dropout = config.attention_probs_dropout_prob + self.rotary_embeddings = None self.position_embedding_type = position_embedding_type or getattr( config, "position_embedding_type", "absolute" ) - self.rotary_embeddings = None - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) - elif self.position_embedding_type == "rotary": + if self.position_embedding_type == "rotary": self.rotary_embeddings = EvollaSaProtRotaryEmbedding(dim=self.attention_head_size) self.is_decoder = config.is_decoder diff --git a/src/transformers/models/flava/configuration_flava.py b/src/transformers/models/flava/configuration_flava.py index c3ecf68a8982..98d60cfaf9e5 100644 --- a/src/transformers/models/flava/configuration_flava.py +++ b/src/transformers/models/flava/configuration_flava.py @@ -148,12 +148,6 @@ class FlavaTextConfig(PretrainedConfig): max_position_embeddings (`int`, *optional*, defaults to 512): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). For VL, max_length passed to model is 77. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For - positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). 
- For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. num_hidden_layers (`int`, *optional*, defaults to 12): @@ -205,7 +199,6 @@ def __init__( vocab_size: int = 30522, type_vocab_size: int = 2, max_position_embeddings: int = 512, - position_embedding_type: str = "absolute", hidden_size: int = 768, num_hidden_layers: int = 12, num_attention_heads: int = 12, @@ -224,7 +217,6 @@ def __init__( self.vocab_size = vocab_size self.type_vocab_size = type_vocab_size self.max_position_embeddings = max_position_embeddings - self.position_embedding_type = position_embedding_type self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads diff --git a/src/transformers/models/flava/modeling_flava.py b/src/transformers/models/flava/modeling_flava.py index c48f2ca1279f..d997325727b1 100644 --- a/src/transformers/models/flava/modeling_flava.py +++ b/src/transformers/models/flava/modeling_flava.py @@ -378,7 +378,6 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) @@ -411,11 +410,11 @@ def forward( inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings diff --git a/src/transformers/models/git/configuration_git.py b/src/transformers/models/git/configuration_git.py index 86c85854ff98..2854e005a9f8 100644 --- a/src/transformers/models/git/configuration_git.py +++ b/src/transformers/models/git/configuration_git.py @@ -140,12 +140,6 @@ class GitConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For - positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). - For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). 
num_image_with_embedding (`int`, *optional*): @@ -184,7 +178,6 @@ def __init__( initializer_range=0.02, layer_norm_eps=1e-12, pad_token_id=0, - position_embedding_type="absolute", use_cache=True, tie_word_embeddings=False, bos_token_id=101, @@ -210,7 +203,6 @@ def __init__( self.max_position_embeddings = max_position_embeddings self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps - self.position_embedding_type = position_embedding_type self.use_cache = use_cache self.tie_word_embeddings = tie_word_embeddings self.num_image_with_embedding = num_image_with_embedding diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index 82a1d5e451ca..2d34f1a58653 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -79,7 +79,6 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) @@ -106,16 +105,16 @@ def forward( else: embeddings = inputs_embeds - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings class GitSelfAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, layer_idx=None): + def __init__(self, config, layer_idx=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -142,12 +141,6 @@ def __init__(self, config, position_embedding_type=None, layer_idx=None): self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58") def forward( @@ -188,28 +181,6 @@ def forward( # Take the dot product between "query" and "key" to get the raw attention scores. 
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - query_length, key_length = query_layer.shape[2], key_layer.shape[2] - if past_key_values is not None: - position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( - -1, 1 - ) - else: - position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) - position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) - distance = position_ids_l - position_ids_r - - positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility - - if self.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores - elif self.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in GitModel forward() function) @@ -256,10 +227,10 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to class GitAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, layer_idx=None): + def __init__(self, config, layer_idx=None): super().__init__() self.self = GIT_SELF_ATTENTION_CLASSES[config._attn_implementation]( - config, position_embedding_type=position_embedding_type, layer_idx=layer_idx + config, layer_idx=layer_idx ) self.output = GitSelfOutput(config) self.pruned_heads = set() diff --git a/src/transformers/models/ibert/configuration_ibert.py b/src/transformers/models/ibert/configuration_ibert.py index 963e6e6c9ed0..6c5023ce45b3 100644 --- a/src/transformers/models/ibert/configuration_ibert.py +++ b/src/transformers/models/ibert/configuration_ibert.py @@ -65,12 +65,6 @@ class IBertConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For - positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). - For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). quant_mode (`bool`, *optional*, defaults to `False`): Whether to quantize the model or not. 
force_dequant (`str`, *optional*, defaults to `"none"`): @@ -100,7 +94,6 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - position_embedding_type="absolute", quant_mode=False, force_dequant="none", **kwargs, @@ -119,7 +112,6 @@ def __init__( self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps - self.position_embedding_type = position_embedding_type self.quant_mode = quant_mode self.force_dequant = force_dequant diff --git a/src/transformers/models/ibert/modeling_ibert.py b/src/transformers/models/ibert/modeling_ibert.py index e1b3c7fb966c..7c879bd704e9 100644 --- a/src/transformers/models/ibert/modeling_ibert.py +++ b/src/transformers/models/ibert/modeling_ibert.py @@ -73,7 +73,6 @@ def __init__(self, config): self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") # End copy self.padding_idx = config.pad_token_id @@ -132,14 +131,13 @@ def forward( identity_scaling_factor=token_type_embeddings_scaling_factor, ) - if self.position_embedding_type == "absolute": - position_embeddings, position_embeddings_scaling_factor = self.position_embeddings(position_ids) - embeddings, embeddings_scaling_factor = self.embeddings_act1( - embeddings, - embeddings_scaling_factor, - identity=position_embeddings, - identity_scaling_factor=position_embeddings_scaling_factor, - ) + position_embeddings, position_embeddings_scaling_factor = self.position_embeddings(position_ids) + embeddings, embeddings_scaling_factor = self.embeddings_act1( + embeddings, + embeddings_scaling_factor, + identity=position_embeddings, + identity_scaling_factor=position_embeddings_scaling_factor, + ) embeddings, embeddings_scaling_factor = self.LayerNorm(embeddings, embeddings_scaling_factor) embeddings = self.dropout(embeddings) @@ -217,9 +215,6 @@ def __init__(self, config): self.output_activation = QuantAct(self.act_bit, quant_mode=self.quant_mode) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - if self.position_embedding_type != "absolute": - raise ValueError("I-BERT only supports 'absolute' for `config.position_embedding_type`") self.softmax = IntSoftmax(self.act_bit, quant_mode=self.quant_mode, force_dequant=config.force_dequant) diff --git a/src/transformers/models/instructblip/configuration_instructblip.py b/src/transformers/models/instructblip/configuration_instructblip.py index 9b8323f15f05..56e6fc60e574 100644 --- a/src/transformers/models/instructblip/configuration_instructblip.py +++ b/src/transformers/models/instructblip/configuration_instructblip.py @@ -146,12 +146,6 @@ class InstructBlipQFormerConfig(PretrainedConfig): The epsilon used by the layer normalization layers. pad_token_id (`int`, *optional*, defaults to 0): Token id used for padding sequences. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For - positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). 
- For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). cross_attention_frequency (`int`, *optional*, defaults to 2): The frequency of adding cross-attention to the Transformer layers. encoder_hidden_size (`int`, *optional*, defaults to 1408): @@ -188,7 +182,6 @@ def __init__( initializer_range=0.02, layer_norm_eps=1e-12, pad_token_id=0, - position_embedding_type="absolute", cross_attention_frequency=2, encoder_hidden_size=1408, **kwargs, @@ -206,7 +199,6 @@ def __init__( self.max_position_embeddings = max_position_embeddings self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps - self.position_embedding_type = position_embedding_type self.cross_attention_frequency = cross_attention_frequency self.encoder_hidden_size = encoder_hidden_size diff --git a/src/transformers/models/instructblip/modeling_instructblip.py b/src/transformers/models/instructblip/modeling_instructblip.py index 20c0def10fd1..23220c19087d 100644 --- a/src/transformers/models/instructblip/modeling_instructblip.py +++ b/src/transformers/models/instructblip/modeling_instructblip.py @@ -456,10 +456,6 @@ def __init__(self, config, is_cross_attention=False): self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.save_attention = False def save_attn_gradients(self, attn_gradients): @@ -508,22 +504,6 @@ def forward( # Take the dot product between "query" and "key" to get the raw attention scores. 
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - seq_length = hidden_states.size()[1] - position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) - position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) - distance = position_ids_l - position_ids_r - positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility - - if self.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores - elif self.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - attention_scores = attention_scores / math.sqrt(self.attention_head_size) attention_scores_dtype = attention_scores.dtype @@ -791,7 +771,6 @@ def __init__(self, config): self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.config = config @@ -812,9 +791,9 @@ def forward( if input_ids is not None: embeddings = self.word_embeddings(input_ids) - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids.to(embeddings.device)) - embeddings = embeddings + position_embeddings + + position_embeddings = self.position_embeddings(position_ids.to(embeddings.device)) + embeddings = embeddings + position_embeddings if query_embeds is not None: embeddings = torch.cat((query_embeds, embeddings), dim=1) diff --git a/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py index af2acc833876..340f04cb2327 100644 --- a/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py @@ -152,12 +152,6 @@ class InstructBlipVideoQFormerConfig(PretrainedConfig): The epsilon used by the layer normalization layers. pad_token_id (`int`, *optional*, defaults to 0): Token id used for padding sequences. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For - positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). - For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). cross_attention_frequency (`int`, *optional*, defaults to 2): The frequency of adding cross-attention to the Transformer layers. 
encoder_hidden_size (`int`, *optional*, defaults to 1408): @@ -194,7 +188,6 @@ def __init__( initializer_range=0.02, layer_norm_eps=1e-12, pad_token_id=0, - position_embedding_type="absolute", cross_attention_frequency=2, encoder_hidden_size=1408, **kwargs, @@ -212,7 +205,6 @@ def __init__( self.max_position_embeddings = max_position_embeddings self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps - self.position_embedding_type = position_embedding_type self.cross_attention_frequency = cross_attention_frequency self.encoder_hidden_size = encoder_hidden_size diff --git a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py index 863e22e82b17..c4bf9aca191b 100644 --- a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py @@ -423,10 +423,6 @@ def __init__(self, config, is_cross_attention=False): self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.save_attention = False def save_attn_gradients(self, attn_gradients): @@ -475,22 +471,6 @@ def forward( # Take the dot product between "query" and "key" to get the raw attention scores. attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - seq_length = hidden_states.size()[1] - position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) - position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) - distance = position_ids_l - position_ids_r - positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility - - if self.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores - elif self.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - attention_scores = attention_scores / math.sqrt(self.attention_head_size) attention_scores_dtype = attention_scores.dtype @@ -753,7 +733,6 @@ def __init__(self, config): self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.config = config @@ -774,9 +753,9 @@ def forward( if input_ids is not None: embeddings = self.word_embeddings(input_ids) - if self.position_embedding_type == "absolute": - position_embeddings = 
self.position_embeddings(position_ids.to(embeddings.device)) - embeddings = embeddings + position_embeddings + + position_embeddings = self.position_embeddings(position_ids.to(embeddings.device)) + embeddings = embeddings + position_embeddings if query_embeds is not None: embeddings = torch.cat((query_embeds, embeddings), dim=1) diff --git a/src/transformers/models/lilt/configuration_lilt.py b/src/transformers/models/lilt/configuration_lilt.py index 940fad4aa810..76bdc6094703 100644 --- a/src/transformers/models/lilt/configuration_lilt.py +++ b/src/transformers/models/lilt/configuration_lilt.py @@ -58,12 +58,6 @@ class LiltConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For - positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). - For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). classifier_dropout (`float`, *optional*): The dropout ratio for the classification head. channel_shrink_ratio (`int`, *optional*, defaults to 4): @@ -102,7 +96,6 @@ def __init__( initializer_range=0.02, layer_norm_eps=1e-12, pad_token_id=0, - position_embedding_type="absolute", classifier_dropout=None, channel_shrink_ratio=4, max_2d_position_embeddings=1024, @@ -122,7 +115,6 @@ def __init__( self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps - self.position_embedding_type = position_embedding_type self.classifier_dropout = classifier_dropout self.channel_shrink_ratio = channel_shrink_ratio self.max_2d_position_embeddings = max_2d_position_embeddings diff --git a/src/transformers/models/lilt/modeling_lilt.py b/src/transformers/models/lilt/modeling_lilt.py index ac1e4d0544bf..3114af4a3d74 100644 --- a/src/transformers/models/lilt/modeling_lilt.py +++ b/src/transformers/models/lilt/modeling_lilt.py @@ -53,7 +53,6 @@ def __init__(self, config): self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") # End copy self.padding_idx = config.pad_token_id @@ -88,11 +87,11 @@ def forward( if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings, position_ids @@ -183,7 +182,7 @@ def forward(self, bbox=None, position_ids=None): class LiltSelfAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, layer_idx=None): + def __init__(self, config, 
layer_idx=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -210,12 +209,6 @@ def __init__(self, config, position_embedding_type=None, layer_idx=None): ) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.channel_shrink_ratio = config.channel_shrink_ratio self.layer_idx = layer_idx @@ -246,22 +239,6 @@ def forward( attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) layout_attention_scores = torch.matmul(layout_query_layer, layout_key_layer.transpose(-1, -2)) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - seq_length = hidden_states.size()[1] - position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) - position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) - distance = position_ids_l - position_ids_r - positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility - - if self.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores - elif self.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - tmp_attention_scores = attention_scores / math.sqrt(self.attention_head_size) tmp_layout_attention_scores = layout_attention_scores / math.sqrt( self.attention_head_size // self.channel_shrink_ratio @@ -336,9 +313,9 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to class LiltAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, layer_idx=None): + def __init__(self, config, layer_idx=None): super().__init__() - self.self = LiltSelfAttention(config, position_embedding_type=position_embedding_type, layer_idx=layer_idx) + self.self = LiltSelfAttention(config, layer_idx=layer_idx) self.output = LiltSelfOutput(config) self.pruned_heads = set() diff --git a/src/transformers/models/megatron_bert/configuration_megatron_bert.py b/src/transformers/models/megatron_bert/configuration_megatron_bert.py index 1505388e2925..f44404d9f76d 100644 --- a/src/transformers/models/megatron_bert/configuration_megatron_bert.py +++ b/src/transformers/models/megatron_bert/configuration_megatron_bert.py @@ -60,12 +60,6 @@ class MegatronBertConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. 
- position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For - positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). - For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). is_decoder (`bool`, *optional*, defaults to `False`): Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. use_cache (`bool`, *optional*, defaults to `True`): @@ -104,7 +98,6 @@ def __init__( initializer_range=0.02, layer_norm_eps=1e-12, pad_token_id=0, - position_embedding_type="absolute", use_cache=True, **kwargs, ): @@ -122,7 +115,6 @@ def __init__( self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps - self.position_embedding_type = position_embedding_type self.use_cache = use_cache diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index 121ae19850ff..82f107cbf6c1 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -66,7 +66,6 @@ def __init__(self, config): self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") def forward( self, @@ -92,11 +91,10 @@ def forward( if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings # Megatron BERT moves that layer norm after the drop-out (and to each layer). 
# embeddings = self.LayerNorm(embeddings) @@ -106,7 +104,7 @@ def forward( # copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->MegatronBert class MegatronBertSelfAttention(nn.Module): - def __init__(self, config, position_embedding_type=None, layer_idx=None): + def __init__(self, config, layer_idx=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -123,12 +121,6 @@ def __init__(self, config, position_embedding_type=None, layer_idx=None): self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_decoder = config.is_decoder self.layer_idx = layer_idx @@ -191,28 +183,6 @@ def forward( # Take the dot product between "query" and "key" to get the raw attention scores. attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - query_length, key_length = query_layer.shape[2], key_layer.shape[2] - if past_key_values is not None: - position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( - -1, 1 - ) - else: - position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) - position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) - distance = position_ids_l - position_ids_r - - positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility - - if self.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores - elif self.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in MegatronBertModel forward() function) diff --git a/src/transformers/models/mra/configuration_mra.py b/src/transformers/models/mra/configuration_mra.py index 16b064c98f7e..c87e9a291893 100644 --- a/src/transformers/models/mra/configuration_mra.py +++ b/src/transformers/models/mra/configuration_mra.py @@ -60,8 +60,6 @@ class MraConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon used by the layer normalization layers. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. 
Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. block_per_row (`int`, *optional*, defaults to 4): Used to set the budget for the high resolution scale. approx_mode (`str`, *optional*, defaults to `"full"`): @@ -103,7 +101,6 @@ def __init__( type_vocab_size=1, initializer_range=0.02, layer_norm_eps=1e-5, - position_embedding_type="absolute", block_per_row=4, approx_mode="full", initial_prior_first_n_blocks=0, @@ -127,7 +124,6 @@ def __init__( self.initializer_range = initializer_range self.type_vocab_size = type_vocab_size self.layer_norm_eps = layer_norm_eps - self.position_embedding_type = position_embedding_type self.block_per_row = block_per_row self.approx_mode = approx_mode self.initial_prior_first_n_blocks = initial_prior_first_n_blocks diff --git a/src/transformers/models/mra/modeling_mra.py b/src/transformers/models/mra/modeling_mra.py index 6612336b6794..679e6900e97e 100644 --- a/src/transformers/models/mra/modeling_mra.py +++ b/src/transformers/models/mra/modeling_mra.py @@ -474,7 +474,6 @@ def __init__(self, config): # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)) + 2) - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device), @@ -506,18 +505,18 @@ def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings class MraSelfAttention(nn.Module): - def __init__(self, config, position_embedding_type=None): + def __init__(self, config): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -541,9 +540,6 @@ def __init__(self, config, position_embedding_type=None): self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = ( - position_embedding_type if position_embedding_type is not None else config.position_embedding_type - ) self.num_block = (config.max_position_embeddings // 32) * config.block_per_row self.num_block = min(self.num_block, int((config.max_position_embeddings // 32) ** 2)) @@ -631,9 +627,9 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to class MraAttention(nn.Module): - def __init__(self, config, position_embedding_type=None): + def __init__(self, config): super().__init__() - self.self = MraSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = MraSelfAttention(config) self.output = MraSelfOutput(config) self.pruned_heads = set() diff --git a/src/transformers/models/nystromformer/modeling_nystromformer.py b/src/transformers/models/nystromformer/modeling_nystromformer.py index 03c134ccadae..7798c96af15c 100755 --- 
a/src/transformers/models/nystromformer/modeling_nystromformer.py +++ b/src/transformers/models/nystromformer/modeling_nystromformer.py @@ -59,7 +59,6 @@ def __init__(self, config): self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)) + 2, persistent=False ) - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device), @@ -91,18 +90,18 @@ def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings class NystromformerSelfAttention(nn.Module): - def __init__(self, config, position_embedding_type=None): + def __init__(self, config): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -128,9 +127,6 @@ def __init__(self, config, position_embedding_type=None): self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) if self.conv_kernel_size is not None: self.conv = nn.Conv2d( @@ -253,9 +249,9 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to class NystromformerAttention(nn.Module): - def __init__(self, config, position_embedding_type=None): + def __init__(self, config): super().__init__() - self.self = NystromformerSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = NystromformerSelfAttention(config) self.output = NystromformerSelfOutput(config) self.pruned_heads = set() diff --git a/src/transformers/models/roberta/configuration_roberta.py b/src/transformers/models/roberta/configuration_roberta.py index 04917804a225..3af141bd0044 100644 --- a/src/transformers/models/roberta/configuration_roberta.py +++ b/src/transformers/models/roberta/configuration_roberta.py @@ -65,12 +65,6 @@ class RobertaConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For - positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). - For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). is_decoder (`bool`, *optional*, defaults to `False`): Whether the model is used as a decoder or not. 
If `False`, the model is used as an encoder. use_cache (`bool`, *optional*, defaults to `True`): @@ -113,7 +107,6 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - position_embedding_type="absolute", use_cache=True, classifier_dropout=None, **kwargs, @@ -132,7 +125,6 @@ def __init__( self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps - self.position_embedding_type = position_embedding_type self.use_cache = use_cache self.classifier_dropout = classifier_dropout diff --git a/src/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py b/src/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py index 72bc808c450d..1e95076743dc 100644 --- a/src/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +++ b/src/transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py @@ -66,12 +66,6 @@ class RobertaPreLayerNormConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For - positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). - For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). is_decoder (`bool`, *optional*, defaults to `False`): Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. use_cache (`bool`, *optional*, defaults to `True`): @@ -114,7 +108,6 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - position_embedding_type="absolute", use_cache=True, classifier_dropout=None, **kwargs, @@ -133,7 +126,6 @@ def __init__( self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps - self.position_embedding_type = position_embedding_type self.use_cache = use_cache self.classifier_dropout = classifier_dropout diff --git a/src/transformers/models/roc_bert/configuration_roc_bert.py b/src/transformers/models/roc_bert/configuration_roc_bert.py index 75f83e11a799..3aaa75a5cac3 100644 --- a/src/transformers/models/roc_bert/configuration_roc_bert.py +++ b/src/transformers/models/roc_bert/configuration_roc_bert.py @@ -65,12 +65,6 @@ class RoCBertConfig(PretrainedConfig): use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if `config.is_decoder=True`. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For - positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). 
- For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). classifier_dropout (`float`, *optional*): The dropout ratio for the classification head. enable_pronunciation (`bool`, *optional*, defaults to `True`): @@ -124,7 +118,6 @@ def __init__( layer_norm_eps=1e-12, use_cache=True, pad_token_id=0, - position_embedding_type="absolute", classifier_dropout=None, enable_pronunciation=True, enable_shape=True, @@ -155,7 +148,6 @@ def __init__( self.shape_embed_dim = shape_embed_dim self.shape_vocab_size = shape_vocab_size self.concat_input = concat_input - self.position_embedding_type = position_embedding_type self.classifier_dropout = classifier_dropout super().__init__(pad_token_id=pad_token_id, **kwargs) diff --git a/src/transformers/models/splinter/modeling_splinter.py b/src/transformers/models/splinter/modeling_splinter.py index 1d9c02877841..83726e948b5e 100755 --- a/src/transformers/models/splinter/modeling_splinter.py +++ b/src/transformers/models/splinter/modeling_splinter.py @@ -53,7 +53,6 @@ def __init__(self, config): self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") def forward( self, @@ -78,11 +77,11 @@ def forward( if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings diff --git a/src/transformers/models/superglue/modeling_superglue.py b/src/transformers/models/superglue/modeling_superglue.py index 4fc524314e89..f57b1e793dfa 100644 --- a/src/transformers/models/superglue/modeling_superglue.py +++ b/src/transformers/models/superglue/modeling_superglue.py @@ -234,7 +234,7 @@ def forward( class SuperGlueSelfAttention(nn.Module): - def __init__(self, config, position_embedding_type=None): + def __init__(self, config): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -251,12 +251,6 @@ def __init__(self, config, position_embedding_type=None): self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = position_embedding_type or getattr( - config, "position_embedding_type", "absolute" - ) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - self.max_position_embeddings = config.max_position_embeddings - self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_decoder = config.is_decoder @@ -296,23 +290,6 @@ def forward( # Take the dot product between "query" and "key" to get the raw attention scores. 
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) - if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": - query_length, key_length = query_layer.shape[2], key_layer.shape[2] - position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) - position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) - distance = position_ids_l - position_ids_r - - positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) - positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility - - if self.position_embedding_type == "relative_key": - relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores - elif self.position_embedding_type == "relative_key_query": - relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) - relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) - attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key - attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in SuperGlueModel forward() function) @@ -358,12 +335,9 @@ def forward(self, hidden_states: torch.Tensor, *args) -> torch.Tensor: class SuperGlueAttention(nn.Module): - def __init__(self, config, position_embedding_type=None): + def __init__(self, config): super().__init__() - self.self = SUPERGLUE_SELF_ATTENTION_CLASSES[config._attn_implementation]( - config, - position_embedding_type=position_embedding_type, - ) + self.self = SUPERGLUE_SELF_ATTENTION_CLASSES[config._attn_implementation](config) self.output = SuperGlueSelfOutput(config) self.pruned_heads = set() diff --git a/src/transformers/models/vilt/modeling_vilt.py b/src/transformers/models/vilt/modeling_vilt.py index 8535b3c747e2..d72911c2a392 100755 --- a/src/transformers/models/vilt/modeling_vilt.py +++ b/src/transformers/models/vilt/modeling_vilt.py @@ -232,7 +232,6 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) @@ -265,11 +264,11 @@ def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings diff --git a/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py b/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py index 
97d6245cb1d7..4cf64e9e1884 100644 --- a/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py @@ -66,12 +66,6 @@ class XLMRobertaConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For - positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). - For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). is_decoder (`bool`, *optional*, defaults to `False`): Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. use_cache (`bool`, *optional*, defaults to `True`): @@ -114,7 +108,6 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - position_embedding_type="absolute", use_cache=True, classifier_dropout=None, **kwargs, @@ -133,7 +126,6 @@ def __init__( self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps - self.position_embedding_type = position_embedding_type self.use_cache = use_cache self.classifier_dropout = classifier_dropout diff --git a/src/transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py index 4111a61d4e26..a7dda1b9b318 100644 --- a/src/transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +++ b/src/transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py @@ -65,12 +65,6 @@ class XLMRobertaXLConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon used by the layer normalization layers. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For - positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). - For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if `config.is_decoder=True`. 
@@ -111,7 +105,6 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - position_embedding_type="absolute", use_cache=True, classifier_dropout=None, **kwargs, @@ -129,7 +122,6 @@ def __init__( self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps - self.position_embedding_type = position_embedding_type self.use_cache = use_cache self.classifier_dropout = classifier_dropout diff --git a/src/transformers/models/xmod/configuration_xmod.py b/src/transformers/models/xmod/configuration_xmod.py index 41bad38a45de..8a0f77e278a9 100644 --- a/src/transformers/models/xmod/configuration_xmod.py +++ b/src/transformers/models/xmod/configuration_xmod.py @@ -65,12 +65,6 @@ class XmodConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For - positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to - [Self-Attention with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155). - For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models - with Better Relative Position Embeddings (Huang et al.)](https://huggingface.co/papers/2009.13658). is_decoder (`bool`, *optional*, defaults to `False`): Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. use_cache (`bool`, *optional*, defaults to `True`): @@ -128,7 +122,6 @@ def __init__( pad_token_id=1, bos_token_id=0, eos_token_id=2, - position_embedding_type="absolute", use_cache=True, classifier_dropout=None, pre_norm=False, @@ -154,7 +147,6 @@ def __init__( self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps - self.position_embedding_type = position_embedding_type self.use_cache = use_cache self.classifier_dropout = classifier_dropout self.pre_norm = pre_norm diff --git a/src/transformers/models/yoso/configuration_yoso.py b/src/transformers/models/yoso/configuration_yoso.py index 9a7fb1218e40..e3efb9d09bd2 100644 --- a/src/transformers/models/yoso/configuration_yoso.py +++ b/src/transformers/models/yoso/configuration_yoso.py @@ -60,8 +60,6 @@ class YosoConfig(PretrainedConfig): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. - position_embedding_type (`str`, *optional*, defaults to `"absolute"`): - Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. use_expectation (`bool`, *optional*, defaults to `True`): Whether or not to use YOSO Expectation. Overrides any effect of num_hash. 
hash_code_len (`int`, *optional*, defaults to 9): @@ -106,7 +104,6 @@ def __init__( type_vocab_size=1, initializer_range=0.02, layer_norm_eps=1e-12, - position_embedding_type="absolute", use_expectation=True, hash_code_len=9, num_hash=64, @@ -132,7 +129,6 @@ def __init__( self.initializer_range = initializer_range self.type_vocab_size = type_vocab_size self.layer_norm_eps = layer_norm_eps - self.position_embedding_type = position_embedding_type self.use_expectation = use_expectation self.hash_code_len = hash_code_len self.num_hash = num_hash diff --git a/src/transformers/models/yoso/modeling_yoso.py b/src/transformers/models/yoso/modeling_yoso.py index b1d8e5e752a1..4ff302360917 100644 --- a/src/transformers/models/yoso/modeling_yoso.py +++ b/src/transformers/models/yoso/modeling_yoso.py @@ -244,7 +244,6 @@ def __init__(self, config): self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)) + 2, persistent=False ) - self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device), @@ -276,18 +275,18 @@ def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - if self.position_embedding_type == "absolute": - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings class YosoSelfAttention(nn.Module): - def __init__(self, config, position_embedding_type=None): + def __init__(self, config): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -310,9 +309,6 @@ def __init__(self, config, position_embedding_type=None): self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - self.position_embedding_type = ( - position_embedding_type if position_embedding_type is not None else config.position_embedding_type - ) self.use_expectation = config.use_expectation self.hash_code_len = config.hash_code_len @@ -449,9 +445,9 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to class YosoAttention(nn.Module): - def __init__(self, config, position_embedding_type=None): + def __init__(self, config): super().__init__() - self.self = YosoSelfAttention(config, position_embedding_type=position_embedding_type) + self.self = YosoSelfAttention(config) self.output = YosoSelfOutput(config) self.pruned_heads = set() diff --git a/tests/models/big_bird/test_modeling_big_bird.py b/tests/models/big_bird/test_modeling_big_bird.py index 8ec874d0f7a8..0969ac297bc2 100644 --- a/tests/models/big_bird/test_modeling_big_bird.py +++ b/tests/models/big_bird/test_modeling_big_bird.py @@ -70,7 +70,6 @@ def __init__( rescale_embeddings=False, block_size=8, num_rand_blocks=3, - position_embedding_type="absolute", scope=None, ): self.parent = parent @@ -101,7 +100,6 @@ def __init__( self.rescale_embeddings = rescale_embeddings self.block_size = block_size self.num_rand_blocks = num_rand_blocks - 
self.position_embedding_type = position_embedding_type def prepare_config_and_inputs(self): input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) @@ -145,7 +143,6 @@ def get_config(self): rescale_embeddings=self.rescale_embeddings, block_size=self.block_size, num_random_blocks=self.num_rand_blocks, - position_embedding_type=self.position_embedding_type, ) def prepare_config_and_inputs_for_decoder(self): diff --git a/tests/models/biogpt/test_modeling_biogpt.py b/tests/models/biogpt/test_modeling_biogpt.py index 26f2053a93aa..ece5c3c9918c 100644 --- a/tests/models/biogpt/test_modeling_biogpt.py +++ b/tests/models/biogpt/test_modeling_biogpt.py @@ -283,12 +283,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - def test_biogpt_model_att_mask_past(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_biogpt_model_attention_mask_past(*config_and_inputs) diff --git a/tests/models/bitnet/test_modeling_bitnet.py b/tests/models/bitnet/test_modeling_bitnet.py index 19bc0c45eb2e..8c1729f8ca85 100644 --- a/tests/models/bitnet/test_modeling_bitnet.py +++ b/tests/models/bitnet/test_modeling_bitnet.py @@ -169,12 +169,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - @require_torch class BitNetIntegrationTest(unittest.TestCase): diff --git a/tests/models/bros/test_modeling_bros.py b/tests/models/bros/test_modeling_bros.py index 8f3f5957e02e..92ad3781fdb2 100644 --- a/tests/models/bros/test_modeling_bros.py +++ b/tests/models/bros/test_modeling_bros.py @@ -353,12 +353,6 @@ def test_model(self): def test_multi_gpu_data_parallel_forward(self): super().test_multi_gpu_data_parallel_forward() - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - def test_for_token_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_token_classification(*config_and_inputs) diff --git a/tests/models/chinese_clip/test_modeling_chinese_clip.py b/tests/models/chinese_clip/test_modeling_chinese_clip.py index dc8e9a145b08..8a99322ced59 100644 --- a/tests/models/chinese_clip/test_modeling_chinese_clip.py +++ b/tests/models/chinese_clip/test_modeling_chinese_clip.py @@ -343,12 +343,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", 
"relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_as_decoder(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) diff --git a/tests/models/cohere/test_modeling_cohere.py b/tests/models/cohere/test_modeling_cohere.py index 436d1f9d4226..682d23f89c5f 100644 --- a/tests/models/cohere/test_modeling_cohere.py +++ b/tests/models/cohere/test_modeling_cohere.py @@ -189,12 +189,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - def test_torch_fx_output_loss(self): super().test_torch_fx_output_loss() diff --git a/tests/models/data2vec/test_modeling_data2vec_text.py b/tests/models/data2vec/test_modeling_data2vec_text.py index 59f86c88cd6c..a21254fedddf 100644 --- a/tests/models/data2vec/test_modeling_data2vec_text.py +++ b/tests/models/data2vec/test_modeling_data2vec_text.py @@ -404,13 +404,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - config_and_inputs[0]._attn_implementation = "eager" - self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_as_decoder(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) @@ -450,12 +443,6 @@ def test_decoder_model_past_with_large_inputs(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - def test_decoder_model_past_with_large_inputs_relative_pos_emb(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - config_and_inputs[0].position_embedding_type = "relative_key" - config_and_inputs[0]._attn_implementation = "eager" - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - def test_for_masked_lm(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) diff --git a/tests/models/dbrx/test_modeling_dbrx.py b/tests/models/dbrx/test_modeling_dbrx.py index d194c74e7b43..c94f3ace6c2e 100644 --- a/tests/models/dbrx/test_modeling_dbrx.py +++ b/tests/models/dbrx/test_modeling_dbrx.py @@ -98,12 +98,6 @@ class DbrxModelTest(CausalLMModelTest, unittest.TestCase): ) model_tester_class = DbrxModelTester - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - @slow def test_model_from_pretrained(self): 
model_name = "eitanturok/dbrx-tiny" diff --git a/tests/models/deepseek_v3/test_modeling_deepseek_v3.py b/tests/models/deepseek_v3/test_modeling_deepseek_v3.py index df97dc4a0af4..ab54ce5a40dd 100644 --- a/tests/models/deepseek_v3/test_modeling_deepseek_v3.py +++ b/tests/models/deepseek_v3/test_modeling_deepseek_v3.py @@ -282,12 +282,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - @parameterized.expand([("yarn",)]) def test_model_rope_scaling_from_config(self, scaling_type): config, _ = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/diffllama/test_modeling_diffllama.py b/tests/models/diffllama/test_modeling_diffllama.py index 9938135281fe..9955ca837317 100644 --- a/tests/models/diffllama/test_modeling_diffllama.py +++ b/tests/models/diffllama/test_modeling_diffllama.py @@ -219,12 +219,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - def test_diffllama_sequence_classification_model(self): config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() config.num_labels = 3 diff --git a/tests/models/electra/test_modeling_electra.py b/tests/models/electra/test_modeling_electra.py index 3a1823cc8c01..0c1afe7612e6 100644 --- a/tests/models/electra/test_modeling_electra.py +++ b/tests/models/electra/test_modeling_electra.py @@ -439,13 +439,6 @@ def test_electra_model_as_decoder(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() self.model_tester.create_and_check_electra_model_as_decoder(*config_and_inputs) - def test_electra_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - config_and_inputs[0]._attn_implementation = "eager" - self.model_tester.create_and_check_electra_model(*config_and_inputs) - def test_for_masked_lm(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_electra_for_masked_lm(*config_and_inputs) diff --git a/tests/models/encoder_decoder/test_modeling_encoder_decoder.py b/tests/models/encoder_decoder/test_modeling_encoder_decoder.py index a6b12a9f65ae..8105f8d0adbb 100644 --- a/tests/models/encoder_decoder/test_modeling_encoder_decoder.py +++ b/tests/models/encoder_decoder/test_modeling_encoder_decoder.py @@ -801,27 +801,6 @@ def prepare_config_and_inputs(self): "labels": decoder_token_labels, } - def test_relative_position_embeds(self): - config_and_inputs = self.prepare_config_and_inputs() - - encoder_config = config_and_inputs["config"] - decoder_config = config_and_inputs["decoder_config"] - - encoder_config._attn_implementation = "eager" - decoder_config._attn_implementation = 
"eager" - encoder_config.position_embedding_type = "relative_key_query" - decoder_config.position_embedding_type = "relative_key_query" - - encoder_model, decoder_model = self.get_encoder_decoder_model(encoder_config, decoder_config) - model = EncoderDecoderModel(encoder=encoder_model, decoder=decoder_model).eval().to(torch_device) - model.config._attn_implementation = "eager" # model config -> won't work - - logits = model( - input_ids=config_and_inputs["input_ids"], decoder_input_ids=config_and_inputs["decoder_input_ids"] - ).logits - - self.assertTrue(logits.shape, (13, 7)) - @slow def test_bert2bert_summarization(self): model = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16") diff --git a/tests/models/ernie/test_modeling_ernie.py b/tests/models/ernie/test_modeling_ernie.py index a500a32e3236..36f75dbc7df6 100644 --- a/tests/models/ernie/test_modeling_ernie.py +++ b/tests/models/ernie/test_modeling_ernie.py @@ -488,13 +488,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - config_and_inputs[0]._attn_implementation = "eager" - self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_as_decoder(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) @@ -542,12 +535,6 @@ def test_decoder_model_past_with_large_inputs(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - def test_decoder_model_past_with_large_inputs_relative_pos_emb(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - config_and_inputs[0].position_embedding_type = "relative_key" - config_and_inputs[0]._attn_implementation = "eager" - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - def test_for_multiple_choice(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_multiple_choice(*config_and_inputs) diff --git a/tests/models/esm/test_modeling_esm.py b/tests/models/esm/test_modeling_esm.py index 72ef77c88c0a..c4cbf971f036 100644 --- a/tests/models/esm/test_modeling_esm.py +++ b/tests/models/esm/test_modeling_esm.py @@ -234,13 +234,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - config_and_inputs[0]._attn_implementation = "eager" - self.model_tester.create_and_check_model(*config_and_inputs) - def test_for_masked_lm(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) diff --git a/tests/models/flava/test_modeling_flava.py b/tests/models/flava/test_modeling_flava.py index 896ce256955a..ab384fec2e71 100644 --- a/tests/models/flava/test_modeling_flava.py +++ 
b/tests/models/flava/test_modeling_flava.py @@ -339,7 +339,6 @@ def __init__( vocab_size=102, type_vocab_size=2, max_position_embeddings=512, - position_embedding_type="absolute", hidden_size=32, num_hidden_layers=2, num_attention_heads=4, @@ -361,7 +360,6 @@ def __init__( self.vocab_size = vocab_size self.type_vocab_size = type_vocab_size self.max_position_embeddings = max_position_embeddings - self.position_embedding_type = position_embedding_type self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads @@ -402,7 +400,6 @@ def get_config(self): vocab_size=self.vocab_size, type_vocab_size=self.type_vocab_size, max_position_embeddings=self.max_position_embeddings, - position_embedding_type=self.position_embedding_type, hidden_size=self.hidden_size, num_hidden_layers=self.num_hidden_layers, num_attention_heads=self.num_attention_heads, diff --git a/tests/models/git/test_modeling_git.py b/tests/models/git/test_modeling_git.py index 931b3fcc8f07..0633d1221120 100644 --- a/tests/models/git/test_modeling_git.py +++ b/tests/models/git/test_modeling_git.py @@ -417,12 +417,6 @@ def test_batched_generate_captioning(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester._test_batched_generate_captioning(*config_and_inputs) - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - def _check_attentions_for_generate( self, batch_size, attentions, prompt_length, output_length, config, decoder_past_key_values ): diff --git a/tests/models/granite/test_modeling_granite.py b/tests/models/granite/test_modeling_granite.py index 12100a8f3e6c..783fa0b8e451 100644 --- a/tests/models/granite/test_modeling_granite.py +++ b/tests/models/granite/test_modeling_granite.py @@ -198,12 +198,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - @parameterized.expand([("linear",), ("dynamic",)]) def test_model_rope_scaling_from_config(self, scaling_type): config, _ = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/granite_speech/test_modeling_granite_speech.py b/tests/models/granite_speech/test_modeling_granite_speech.py index adb925934548..6b8343a62ef6 100644 --- a/tests/models/granite_speech/test_modeling_granite_speech.py +++ b/tests/models/granite_speech/test_modeling_granite_speech.py @@ -108,7 +108,6 @@ def __init__( "model_type": "blip_2_qformer", "num_attention_heads": 4, "num_hidden_layers": 2, - "position_embedding_type": "absolute", "use_qformer_text_input": False, "vocab_size": 30522, }, diff --git a/tests/models/granitemoe/test_modeling_granitemoe.py b/tests/models/granitemoe/test_modeling_granitemoe.py index 7a27474b1b49..f999e9bd0cd3 100644 --- a/tests/models/granitemoe/test_modeling_granitemoe.py +++ b/tests/models/granitemoe/test_modeling_granitemoe.py @@ -197,12 +197,6 @@ def test_model(self): config_and_inputs = 
self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - @parameterized.expand([("linear",), ("dynamic",)]) def test_model_rope_scaling_from_config(self, scaling_type): config, _ = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/granitemoeshared/test_modeling_granitemoeshared.py b/tests/models/granitemoeshared/test_modeling_granitemoeshared.py index 2186985408b4..b377e40caeaa 100644 --- a/tests/models/granitemoeshared/test_modeling_granitemoeshared.py +++ b/tests/models/granitemoeshared/test_modeling_granitemoeshared.py @@ -200,12 +200,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - @parameterized.expand([("linear",), ("dynamic",)]) def test_model_rope_scaling_from_config(self, scaling_type): config, _ = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/ibert/test_modeling_ibert.py b/tests/models/ibert/test_modeling_ibert.py index 9065a7046b6d..8d7bcf613fd1 100644 --- a/tests/models/ibert/test_modeling_ibert.py +++ b/tests/models/ibert/test_modeling_ibert.py @@ -265,13 +265,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - # I-BERT only supports absolute embedding - for type in ["absolute"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - def test_for_masked_lm(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) diff --git a/tests/models/layoutlm/test_modeling_layoutlm.py b/tests/models/layoutlm/test_modeling_layoutlm.py index a7cd87015609..422aaa22eb7b 100644 --- a/tests/models/layoutlm/test_modeling_layoutlm.py +++ b/tests/models/layoutlm/test_modeling_layoutlm.py @@ -256,12 +256,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - def test_for_masked_lm(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) diff --git a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py index 2c1b157e3a90..47905d034a73 100644 --- a/tests/models/layoutlmv2/test_modeling_layoutlmv2.py +++ 
b/tests/models/layoutlmv2/test_modeling_layoutlmv2.py @@ -306,12 +306,6 @@ def test_model(self): def test_multi_gpu_data_parallel_forward(self): pass - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - def test_for_sequence_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) diff --git a/tests/models/layoutlmv3/test_modeling_layoutlmv3.py b/tests/models/layoutlmv3/test_modeling_layoutlmv3.py index e63ec1b5eb9d..fedbd1975649 100644 --- a/tests/models/layoutlmv3/test_modeling_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_modeling_layoutlmv3.py @@ -354,12 +354,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - def test_for_sequence_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) diff --git a/tests/models/lilt/test_modeling_lilt.py b/tests/models/lilt/test_modeling_lilt.py index 949649a503df..d88f47b3c0d6 100644 --- a/tests/models/lilt/test_modeling_lilt.py +++ b/tests/models/lilt/test_modeling_lilt.py @@ -265,12 +265,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - def test_for_token_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_token_classification(*config_and_inputs) diff --git a/tests/models/modernbert/test_modeling_modernbert.py b/tests/models/modernbert/test_modeling_modernbert.py index b1f0ce468a38..ff59c0f15565 100644 --- a/tests/models/modernbert/test_modeling_modernbert.py +++ b/tests/models/modernbert/test_modeling_modernbert.py @@ -294,12 +294,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/mra/test_modeling_mra.py b/tests/models/mra/test_modeling_mra.py index ccb725baa78f..82613f9a724f 100644 --- a/tests/models/mra/test_modeling_mra.py +++ b/tests/models/mra/test_modeling_mra.py @@ -289,12 +289,6 @@ def 
test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - def test_for_masked_lm(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) diff --git a/tests/models/nystromformer/test_modeling_nystromformer.py b/tests/models/nystromformer/test_modeling_nystromformer.py index 7966b03c0f60..ad6bd19cf430 100644 --- a/tests/models/nystromformer/test_modeling_nystromformer.py +++ b/tests/models/nystromformer/test_modeling_nystromformer.py @@ -253,12 +253,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - def test_for_masked_lm(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) diff --git a/tests/models/olmo/test_modeling_olmo.py b/tests/models/olmo/test_modeling_olmo.py index d8b26fe02228..cd31c0882811 100644 --- a/tests/models/olmo/test_modeling_olmo.py +++ b/tests/models/olmo/test_modeling_olmo.py @@ -194,12 +194,6 @@ def test_model(self): def test_headmasking(self): pass - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - @parameterized.expand([("linear",), ("dynamic",)]) def test_model_rope_scaling(self, scaling_type): config, _ = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/olmo2/test_modeling_olmo2.py b/tests/models/olmo2/test_modeling_olmo2.py index eddf63ae1e05..a993f12f839d 100644 --- a/tests/models/olmo2/test_modeling_olmo2.py +++ b/tests/models/olmo2/test_modeling_olmo2.py @@ -195,12 +195,6 @@ def test_model(self): def test_headmasking(self): pass - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - @parameterized.expand([("linear",), ("dynamic",)]) def test_model_rope_scaling(self, scaling_type): config, _ = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/olmoe/test_modeling_olmoe.py b/tests/models/olmoe/test_modeling_olmoe.py index e9d6bb8df8ba..99b5c4a88fdd 100644 --- a/tests/models/olmoe/test_modeling_olmoe.py +++ b/tests/models/olmoe/test_modeling_olmoe.py @@ -206,12 +206,6 @@ def test_model(self): def test_headmasking(self): pass - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", 
"relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - @parameterized.expand([("linear",), ("dynamic",)]) def test_model_rope_scaling(self, scaling_type): config, _ = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/rembert/test_modeling_rembert.py b/tests/models/rembert/test_modeling_rembert.py index 93a16866601b..e142f866f202 100644 --- a/tests/models/rembert/test_modeling_rembert.py +++ b/tests/models/rembert/test_modeling_rembert.py @@ -381,12 +381,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - def test_for_masked_lm(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) diff --git a/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py b/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py index 7605be9e2c84..83f8da9781a7 100644 --- a/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py +++ b/tests/models/roberta_prelayernorm/test_modeling_roberta_prelayernorm.py @@ -413,14 +413,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - # Copied from tests.models.roberta.test_modeling_roberta.RobertaModelTest.test_model_various_embeddings - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - config_and_inputs[0]._attn_implementation = "eager" - self.model_tester.create_and_check_model(*config_and_inputs) - # Copied from tests.models.roberta.test_modeling_roberta.RobertaModelTest.test_model_as_decoder def test_model_as_decoder(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() diff --git a/tests/models/roc_bert/test_modeling_roc_bert.py b/tests/models/roc_bert/test_modeling_roc_bert.py index 23a6017168a3..9dd1fc3b1390 100644 --- a/tests/models/roc_bert/test_modeling_roc_bert.py +++ b/tests/models/roc_bert/test_modeling_roc_bert.py @@ -629,13 +629,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - config_and_inputs[0]._attn_implementation = "eager" - self.model_tester.create_and_check_model(*config_and_inputs) - def test_for_masked_lm(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) @@ -648,12 +641,6 @@ def test_decoder_model_past_with_large_inputs(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() 
self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - def test_decoder_model_past_with_large_inputs_relative_pos_emb(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - config_and_inputs[0].position_embedding_type = "relative_key" - config_and_inputs[0]._attn_implementation = "eager" - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - def test_for_question_answering(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_question_answering(*config_and_inputs) diff --git a/tests/models/splinter/test_modeling_splinter.py b/tests/models/splinter/test_modeling_splinter.py index f8a8121c40d1..6dcd2bf8c5ca 100644 --- a/tests/models/splinter/test_modeling_splinter.py +++ b/tests/models/splinter/test_modeling_splinter.py @@ -283,12 +283,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - def test_for_question_answering(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_question_answering(*config_and_inputs) diff --git a/tests/models/visual_bert/test_modeling_visual_bert.py b/tests/models/visual_bert/test_modeling_visual_bert.py index 09c96a2467b0..a49419dd321f 100644 --- a/tests/models/visual_bert/test_modeling_visual_bert.py +++ b/tests/models/visual_bert/test_modeling_visual_bert.py @@ -522,12 +522,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_for_pretraining(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_pretraining() self.model_tester.create_and_check_for_pretraining(*config_and_inputs) diff --git a/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py b/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py index 6ab20ba5feb0..f62d076e0217 100644 --- a/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py +++ b/tests/models/xlm_roberta_xl/test_modeling_xlm_roberta_xl.py @@ -420,13 +420,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - config_and_inputs[0]._attn_implementation = "eager" - self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_as_decoder(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) @@ -466,12 +459,6 @@ def 
test_decoder_model_past_with_large_inputs(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - def test_decoder_model_past_with_large_inputs_relative_pos_emb(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - config_and_inputs[0].position_embedding_type = "relative_key" - config_and_inputs[0]._attn_implementation = "eager" - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - def test_for_masked_lm(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) diff --git a/tests/models/xmod/test_modeling_xmod.py b/tests/models/xmod/test_modeling_xmod.py index 298c7ad3a27b..15666f03c1a4 100644 --- a/tests/models/xmod/test_modeling_xmod.py +++ b/tests/models/xmod/test_modeling_xmod.py @@ -418,13 +418,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - config_and_inputs[0]._attn_implementation = "eager" - self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_as_decoder(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) @@ -464,12 +457,6 @@ def test_decoder_model_past_with_large_inputs(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - def test_decoder_model_past_with_large_inputs_relative_pos_emb(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() - config_and_inputs[0].position_embedding_type = "relative_key" - config_and_inputs[0]._attn_implementation = "eager" - self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - def test_for_masked_lm(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) diff --git a/tests/models/yoso/test_modeling_yoso.py b/tests/models/yoso/test_modeling_yoso.py index 621cb184e84e..f827928dcf09 100644 --- a/tests/models/yoso/test_modeling_yoso.py +++ b/tests/models/yoso/test_modeling_yoso.py @@ -287,12 +287,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - def test_model_various_embeddings(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - for type in ["absolute", "relative_key", "relative_key_query"]: - config_and_inputs[0].position_embedding_type = type - self.model_tester.create_and_check_model(*config_and_inputs) - def test_for_masked_lm(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) From 34c360551429e4cc163c404e62aaeedf4b71b98f Mon Sep 17 00:00:00 2001 From: vasqu Date: Thu, 25 Sep 2025 19:00:18 +0200 Subject: [PATCH 3/5] style / copies --- src/transformers/models/bert/modeling_bert.py | 8 ++------ 
.../models/deprecated/ernie_m/modeling_ernie_m.py | 1 - .../models/deprecated/realm/modeling_realm.py | 1 - src/transformers/models/git/modeling_git.py | 4 +--- .../roberta_prelayernorm/modeling_roberta_prelayernorm.py | 8 ++------ .../models/xlm_roberta_xl/modular_xlm_roberta_xl.py | 4 +--- src/transformers/models/xmod/modeling_xmod.py | 8 ++------ 7 files changed, 8 insertions(+), 26 deletions(-) diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 54db194baf46..cc83cfa70999 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -319,15 +319,11 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to class BertAttention(nn.Module): - def __init__( - self, config, is_causal=False, layer_idx=None, is_cross_attention=False - ): + def __init__(self, config, is_causal=False, layer_idx=None, is_cross_attention=False): super().__init__() self.is_cross_attention = is_cross_attention attention_class = BertCrossAttention if is_cross_attention else BertSelfAttention - self.self = attention_class( - config, is_causal=is_causal, layer_idx=layer_idx - ) + self.self = attention_class(config, is_causal=is_causal, layer_idx=layer_idx) self.output = BertSelfOutput(config) self.pruned_heads = set() diff --git a/src/transformers/models/deprecated/ernie_m/modeling_ernie_m.py b/src/transformers/models/deprecated/ernie_m/modeling_ernie_m.py index 2f960bc52b83..304b9b888154 100755 --- a/src/transformers/models/deprecated/ernie_m/modeling_ernie_m.py +++ b/src/transformers/models/deprecated/ernie_m/modeling_ernie_m.py @@ -151,7 +151,6 @@ def forward( query_layer = self.transpose_for_scores(mixed_query_layer) - use_cache = past_key_values is not None if self.is_decoder: # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. # Further calls to cross_attention layer can then reuse all cross-attention diff --git a/src/transformers/models/deprecated/realm/modeling_realm.py b/src/transformers/models/deprecated/realm/modeling_realm.py index 4fa585dcc1a9..dc992cfd0aed 100644 --- a/src/transformers/models/deprecated/realm/modeling_realm.py +++ b/src/transformers/models/deprecated/realm/modeling_realm.py @@ -170,7 +170,6 @@ def forward( query_layer = self.transpose_for_scores(mixed_query_layer) - use_cache = past_key_values is not None if self.is_decoder: # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. 
# Further calls to cross_attention layer can then reuse all cross-attention diff --git a/src/transformers/models/git/modeling_git.py b/src/transformers/models/git/modeling_git.py index 2d34f1a58653..eeb7f351402c 100644 --- a/src/transformers/models/git/modeling_git.py +++ b/src/transformers/models/git/modeling_git.py @@ -229,9 +229,7 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to class GitAttention(nn.Module): def __init__(self, config, layer_idx=None): super().__init__() - self.self = GIT_SELF_ATTENTION_CLASSES[config._attn_implementation]( - config, layer_idx=layer_idx - ) + self.self = GIT_SELF_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx) self.output = GitSelfOutput(config) self.pruned_heads = set() diff --git a/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py b/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py index 169961fc77d5..ae58b0281b5f 100644 --- a/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +++ b/src/transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py @@ -363,15 +363,11 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to class RobertaPreLayerNormAttention(nn.Module): - def __init__( - self, config, is_causal=False, layer_idx=None, is_cross_attention=False - ): + def __init__(self, config, is_causal=False, layer_idx=None, is_cross_attention=False): super().__init__() self.is_cross_attention = is_cross_attention attention_class = RobertaPreLayerNormCrossAttention if is_cross_attention else RobertaPreLayerNormSelfAttention - self.self = attention_class( - config, is_causal=is_causal, layer_idx=layer_idx - ) + self.self = attention_class(config, is_causal=is_causal, layer_idx=layer_idx) self.output = RobertaPreLayerNormSelfOutput(config) self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.pruned_heads = set() diff --git a/src/transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py index 1462625d83a2..62f66e9f91dd 100644 --- a/src/transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py +++ b/src/transformers/models/xlm_roberta_xl/modular_xlm_roberta_xl.py @@ -133,9 +133,7 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to class XLMRobertaXLAttention(BertAttention): - def __init__( - self, config, is_causal=False, layer_idx=None, is_cross_attention=False - ): + def __init__(self, config, is_causal=False, layer_idx=None, is_cross_attention=False): super().__init__(config, is_causal, layer_idx, is_cross_attention) del self.LayerNorm diff --git a/src/transformers/models/xmod/modeling_xmod.py b/src/transformers/models/xmod/modeling_xmod.py index 7c7f8ffae58d..cad084f22475 100644 --- a/src/transformers/models/xmod/modeling_xmod.py +++ b/src/transformers/models/xmod/modeling_xmod.py @@ -364,15 +364,11 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to class XmodAttention(nn.Module): - def __init__( - self, config, is_causal=False, layer_idx=None, is_cross_attention=False - ): + def __init__(self, config, is_causal=False, layer_idx=None, is_cross_attention=False): super().__init__() self.is_cross_attention = is_cross_attention attention_class = XmodCrossAttention if is_cross_attention else XmodSelfAttention - self.self = attention_class( - config, is_causal=is_causal, layer_idx=layer_idx - ) + self.self = 
attention_class(config, is_causal=is_causal, layer_idx=layer_idx) self.output = XmodSelfOutput(config) self.pruned_heads = set() self.pre_norm = config.pre_norm From 0dbd18b0b357560531a5d45415b97f4e17269fe6 Mon Sep 17 00:00:00 2001 From: vasqu Date: Thu, 25 Sep 2025 20:24:40 +0200 Subject: [PATCH 4/5] revert deprecated models and fixup some models --- .../deprecated/ernie_m/modeling_ernie_m.py | 35 +++++- .../deprecated/qdqbert/modeling_qdqbert.py | 29 ++++- .../models/deprecated/realm/modeling_realm.py | 48 +++++++-- src/transformers/models/esm/modeling_esm.py | 96 +++++++++++++---- .../models/evolla/modeling_evolla.py | 101 +++++++----------- .../models/evolla/modular_evolla.py | 100 +++++++---------- 6 files changed, 253 insertions(+), 156 deletions(-) diff --git a/src/transformers/models/deprecated/ernie_m/modeling_ernie_m.py b/src/transformers/models/deprecated/ernie_m/modeling_ernie_m.py index 304b9b888154..4cecdf5728a3 100755 --- a/src/transformers/models/deprecated/ernie_m/modeling_ernie_m.py +++ b/src/transformers/models/deprecated/ernie_m/modeling_ernie_m.py @@ -88,7 +88,7 @@ def forward( class ErnieMSelfAttention(nn.Module): - def __init__(self, config): + def __init__(self, config, position_embedding_type=None): super().__init__() if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -105,6 +105,12 @@ def __init__(self, config): self.v_proj = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_decoder = config.is_decoder @@ -151,6 +157,7 @@ def forward( query_layer = self.transpose_for_scores(mixed_query_layer) + use_cache = past_key_values is not None if self.is_decoder: # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. # Further calls to cross_attention layer can then reuse all cross-attention @@ -164,6 +171,28 @@ def forward( # Take the dot product between "query" and "key" to get the raw attention scores. 
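# --- Illustrative sketch (annotation, not part of the patch) ----------------
# A standalone condensation of the relative-position scoring that this commit
# keeps in the deprecated models (ernie_m, qdqbert, realm). `distance_embedding`
# is an nn.Embedding(2 * max_position_embeddings - 1, head_dim); "relative_key"
# adds a query-side bias to the raw scores, while "relative_key_query" adds a
# key-side term as well. The function name and signature are illustrative only.
import math

import torch


def relative_attention_scores(
    query,  # [batch, heads, q_len, head_dim]
    key,    # [batch, heads, k_len, head_dim]
    distance_embedding,
    max_position_embeddings,
    position_embedding_type="relative_key",
    use_cache=False,
):
    query_length, key_length = query.shape[2], key.shape[2]
    if use_cache:
        # with a KV cache the single query token sits at the last key position
        position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=query.device).view(-1, 1)
    else:
        position_ids_l = torch.arange(query_length, dtype=torch.long, device=query.device).view(-1, 1)
    position_ids_r = torch.arange(key_length, dtype=torch.long, device=query.device).view(1, -1)
    distance = position_ids_l - position_ids_r

    positional_embedding = distance_embedding(distance + max_position_embeddings - 1)
    positional_embedding = positional_embedding.to(dtype=query.dtype)  # fp16 compatibility

    scores = torch.matmul(query, key.transpose(-1, -2))
    if position_embedding_type == "relative_key":
        scores = scores + torch.einsum("bhld,lrd->bhlr", query, positional_embedding)
    elif position_embedding_type == "relative_key_query":
        scores = scores + torch.einsum("bhld,lrd->bhlr", query, positional_embedding)
        scores = scores + torch.einsum("bhrd,lrd->bhlr", key, positional_embedding)
    return scores / math.sqrt(query.size(-1))
# ----------------------------------------------------------------------------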
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if use_cache: + position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( + -1, 1 + ) + else: + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in ErnieMModel forward() function) @@ -194,9 +223,9 @@ def forward( class ErnieMAttention(nn.Module): - def __init__(self, config): + def __init__(self, config, position_embedding_type=None): super().__init__() - self.self_attn = ErnieMSelfAttention(config) + self.self_attn = ErnieMSelfAttention(config, position_embedding_type=position_embedding_type) self.out_proj = nn.Linear(config.hidden_size, config.hidden_size) self.pruned_heads = set() diff --git a/src/transformers/models/deprecated/qdqbert/modeling_qdqbert.py b/src/transformers/models/deprecated/qdqbert/modeling_qdqbert.py index b924d76eb6fc..f522a1d72154 100755 --- a/src/transformers/models/deprecated/qdqbert/modeling_qdqbert.py +++ b/src/transformers/models/deprecated/qdqbert/modeling_qdqbert.py @@ -82,6 +82,7 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) @@ -121,11 +122,11 @@ def forward( if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings @@ -149,6 +150,10 @@ def __init__(self, config): self.value = quant_nn.QuantLinear(config.hidden_size, 
self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_decoder = config.is_decoder @@ -215,6 +220,22 @@ def forward( self.matmul_q_input_quantizer(query_layer), self.matmul_k_input_quantizer(key_layer.transpose(-1, -2)) ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in QDQBertModel forward() function) diff --git a/src/transformers/models/deprecated/realm/modeling_realm.py b/src/transformers/models/deprecated/realm/modeling_realm.py index dc992cfd0aed..69bab60f6803 100644 --- a/src/transformers/models/deprecated/realm/modeling_realm.py +++ b/src/transformers/models/deprecated/realm/modeling_realm.py @@ -57,6 +57,7 @@ def __init__(self, config): self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout_prob) # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) @@ -96,18 +97,18 @@ def forward( if inputs_embeds is None: inputs_embeds = self.word_embeddings(input_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - embeddings = inputs_embeds + token_type_embeddings - - position_embeddings = self.position_embeddings(position_ids) - embeddings += position_embeddings + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) return embeddings class RealmSelfAttention(nn.Module): - def __init__(self, config): + def __init__(self, config, position_embedding_type=None): super().__init__() if 
config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): raise ValueError( @@ -124,6 +125,12 @@ def __init__(self, config): self.value = nn.Linear(config.hidden_size, self.all_head_size) self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) self.is_decoder = config.is_decoder @@ -170,6 +177,7 @@ def forward( query_layer = self.transpose_for_scores(mixed_query_layer) + use_cache = past_key_values is not None if self.is_decoder: # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. # Further calls to cross_attention layer can then reuse all cross-attention @@ -183,6 +191,28 @@ def forward( # Take the dot product between "query" and "key" to get the raw attention scores. attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if use_cache: + position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( + -1, 1 + ) + else: + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in RealmModel forward() function) @@ -232,9 +262,11 @@ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> to class RealmAttention(nn.Module): - def __init__(self, config): + def __init__(self, config, position_embedding_type=None): super().__init__() - self.self = REALM_SELF_ATTENTION_CLASSES[config._attn_implementation](config) + self.self = REALM_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) self.output = RealmSelfOutput(config) self.pruned_heads = set() @@ -321,7 +353,7 @@ def __init__(self, config): if self.add_cross_attention: if not self.is_decoder: raise ValueError(f"{self} should be used as a decoder model if cross attention is added") - self.crossattention = RealmAttention(config) + 
self.crossattention = RealmAttention(config, position_embedding_type="absolute") self.intermediate = RealmIntermediate(config) self.output = RealmOutput(config) diff --git a/src/transformers/models/esm/modeling_esm.py b/src/transformers/models/esm/modeling_esm.py index 973e17c6ade6..d635e90094d3 100755 --- a/src/transformers/models/esm/modeling_esm.py +++ b/src/transformers/models/esm/modeling_esm.py @@ -22,6 +22,7 @@ from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_attention_mask_for_sdpa from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import ( BaseModelOutputWithCrossAttentions, @@ -33,11 +34,15 @@ from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer -from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_flex_attn_available, logging from ...utils.generic import OutputRecorder, check_model_inputs from .configuration_esm import EsmConfig +if is_torch_flex_attn_available(): + from ...integrations.flex_attention import make_flex_block_causal_mask + + logger = logging.get_logger(__name__) @@ -692,25 +697,17 @@ def forward( position_ids=position_ids, ) - if self.config._attn_implementation != "flash_attention_2": - batch_size, seq_length = inputs_embeds.shape[:-1] - if attention_mask is None: - attention_mask = torch.ones(((batch_size, seq_length)), device=inputs_embeds.device) - - attention_mask: torch.Tensor = self.get_extended_attention_mask( - attention_mask, input_shape=(batch_size, seq_length) - ) + attention_mask = self._update_full_mask( + attention_mask, + inputs_embeds, + ) - # If a 2D or 3D attention mask is provided for the cross-attention - # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.config.is_decoder and encoder_hidden_states is not None: - encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() - encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) - if encoder_attention_mask is None: - encoder_attention_mask = torch.ones(encoder_hidden_shape, device=inputs_embeds.device) - encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) - else: - encoder_extended_attention_mask = None + encoder_attention_mask = self._update_cross_attn_mask( + encoder_hidden_states, + encoder_attention_mask, + inputs_embeds.shape[:2], + inputs_embeds, + ) # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head @@ -724,7 +721,7 @@ def forward( attention_mask=attention_mask, head_mask=head_mask, encoder_hidden_states=encoder_hidden_states, - encoder_attention_mask=encoder_extended_attention_mask, + encoder_attention_mask=encoder_attention_mask, **kwargs, ) sequence_output = encoder_outputs[0] @@ -735,6 +732,65 @@ def forward( pooler_output=pooled_output, ) + # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_full_mask + def _update_full_mask( + self, + attention_mask: Union[torch.Tensor, None], + inputs_embeds: torch.Tensor, + ): + if attention_mask is not None: + if "flash" in self.config._attn_implementation: + attention_mask = attention_mask if 0 in attention_mask else None + elif self.config._attn_implementation == "sdpa": + # 
output_attentions=True & head_mask can not be supported when using SDPA, fall back to + # the manual implementation that requires a 4D causal mask in all cases. + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype) + elif self.config._attn_implementation == "flex_attention": + if isinstance(attention_mask, torch.Tensor): + attention_mask = make_flex_block_causal_mask(attention_mask, is_causal=False) + else: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype) + + return attention_mask + + # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_cross_attn_mask + def _update_cross_attn_mask( + self, + encoder_hidden_states: Union[torch.Tensor, None], + encoder_attention_mask: Union[torch.Tensor, None], + input_shape: torch.Size, + inputs_embeds: torch.Tensor, + ): + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + if "flash" in self.config._attn_implementation: + encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None + elif self.config._attn_implementation == "sdpa": + # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa( + encoder_attention_mask, + inputs_embeds.dtype, + tgt_len=input_shape[-1], + ) + elif self.config._attn_implementation == "flex_attention": + if isinstance(encoder_attention_mask, torch.Tensor): + encoder_attention_mask = make_flex_block_causal_mask( + encoder_attention_mask, + query_length=input_shape[-1], + is_causal=False, + ) + else: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask( + encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + return encoder_attention_mask + def predict_contacts(self, tokens, attention_mask): attns = self(tokens, attention_mask=attention_mask, return_dict=True, output_attentions=True).attentions attns = torch.stack(attns, dim=1) # Matches the original model layout diff --git a/src/transformers/models/evolla/modeling_evolla.py b/src/transformers/models/evolla/modeling_evolla.py index c3fb683b7a95..2e29fadab83f 100644 --- a/src/transformers/models/evolla/modeling_evolla.py +++ b/src/transformers/models/evolla/modeling_evolla.py @@ -20,18 +20,18 @@ # limitations under the License. 
import math -import warnings from dataclasses import dataclass from typing import Callable, Optional, Union import torch -from torch import Tensor, nn +from torch import nn from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache from ...generation import GenerationMixin from ...integrations import use_kernel_forward_from_hub from ...masking_utils import create_causal_mask +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_attention_mask_for_sdpa from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import ( BaseModelOutputWithCrossAttentions, @@ -41,15 +41,19 @@ ModelOutput, ) from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update -from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, ModuleUtilsMixin, PreTrainedModel, get_parameter_dtype +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer -from ...utils import TransformersKwargs, auto_docstring, can_return_tuple +from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_torch_flex_attn_available from ...utils.deprecation import deprecate_kwarg from ...utils.generic import OutputRecorder, check_model_inputs from .configuration_evolla import EvollaConfig, SaProtConfig +if is_torch_flex_attn_available(): + from ...integrations.flex_attention import make_flex_block_causal_mask + + def create_position_ids_from_input_ids(input_ids, padding_idx): """ Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols @@ -271,6 +275,8 @@ def __init__(self, config, position_embedding_type=None, layer_idx=None, is_cros self.key = nn.Linear(config.hidden_size, self.all_head_size) self.value = nn.Linear(config.hidden_size, self.all_head_size) + self.dropout = config.attention_probs_dropout_prob + self.rotary_embeddings = None self.position_embedding_type = position_embedding_type or getattr( config, "position_embedding_type", "absolute" @@ -538,6 +544,7 @@ class EvollaSaProtPreTrainedModel(PreTrainedModel): _no_split_modules = ["EvollaSaProtLayer"] _supports_flash_attn = True _supports_sdpa = True + _supports_flex_attn = True _supports_attention_backend = True _can_record_outputs = { @@ -589,6 +596,7 @@ def forward( self, input_ids: Optional[torch.Tensor], attention_mask: Optional[torch.Tensor] = None, + **kwargs, ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: input_shape = input_ids.size() batch_size, seq_length = input_shape @@ -596,10 +604,14 @@ def forward( device = input_ids.device if attention_mask is None: attention_mask = torch.ones(((batch_size, seq_length)), device=device) - inputs_embeds = self.embeddings(input_ids=input_ids, attention_mask=attention_mask) - extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) - encoder_outputs = self.encoder(inputs_embeds, attention_mask=extended_attention_mask) + + attention_mask = self._update_full_mask( + attention_mask, + inputs_embeds, + ) + + encoder_outputs = self.encoder(inputs_embeds, attention_mask=attention_mask, **kwargs) sequence_output = encoder_outputs[0] return BaseModelOutputWithPoolingAndCrossAttentions( @@ -609,61 +621,28 @@ def forward( cross_attentions=encoder_outputs.cross_attentions, ) - def get_extended_attention_mask( + # Copied from 
transformers.models.bart.modeling_bart.BartPreTrainedModel._update_full_mask + def _update_full_mask( self, - attention_mask: Tensor, - input_shape: tuple[int], - device: Optional[torch.device] = None, - dtype: Optional[torch.dtype] = None, - ) -> Tensor: - """ - Makes broadcastable attention and causal masks so that future and masked tokens are ignored. - - Arguments: - attention_mask (`torch.Tensor`): - Mask with ones indicating tokens to attend to, zeros for tokens to ignore. - input_shape (`Tuple[int]`): - The shape of the input to the model. - - Returns: - `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. - """ - if dtype is None: - dtype = get_parameter_dtype(self) - - if not (attention_mask.dim() == 2 and self.config.is_decoder): - # show warning only if it won't be shown in `create_extended_attention_mask_for_decoder` - if device is not None: - warnings.warn( - "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning - ) - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. - if attention_mask.dim() == 3: - extended_attention_mask = attention_mask[:, None, :, :] - elif attention_mask.dim() == 2: - # Provided a padding mask of dimensions [batch_size, seq_length] - # - if the model is a decoder, apply a causal mask in addition to the padding mask - # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.config.is_decoder: - extended_attention_mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder( - input_shape, attention_mask, device - ) + attention_mask: Union[torch.Tensor, None], + inputs_embeds: torch.Tensor, + ): + if attention_mask is not None: + if "flash" in self.config._attn_implementation: + attention_mask = attention_mask if 0 in attention_mask else None + elif self.config._attn_implementation == "sdpa": + # output_attentions=True & head_mask can not be supported when using SDPA, fall back to + # the manual implementation that requires a 4D causal mask in all cases. + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype) + elif self.config._attn_implementation == "flex_attention": + if isinstance(attention_mask, torch.Tensor): + attention_mask = make_flex_block_causal_mask(attention_mask, is_causal=False) else: - extended_attention_mask = attention_mask[:, None, None, :] - else: - raise ValueError( - f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" - ) + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype) - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and the dtype's smallest value for masked positions. - # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. 
- extended_attention_mask = extended_attention_mask.to(dtype=dtype) # fp16 compatibility - extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(dtype).min - return extended_attention_mask + return attention_mask class EvollaSequenceCompressorAttention(nn.Module): @@ -1320,9 +1299,9 @@ class EvollaPreTrainedModel(PreTrainedModel): "EvollaSequenceAlignerCrossAttention", ] _skip_keys_device_placement = ["past_key_values"] - _supports_flash_attn = False # see dependency on `EvollaSaProtProteinEncoder` + _supports_flash_attn = False # see dependency on `EvollaSequenceCompressorResampler` _supports_sdpa = True - _supports_flex_attn = False # see dependency on `EvollaSaProtProteinEncoder` + _supports_flex_attn = False # see dependency on `EvollaSequenceCompressorResampler` _can_compile_fullgraph = True _supports_attention_backend = False diff --git a/src/transformers/models/evolla/modular_evolla.py b/src/transformers/models/evolla/modular_evolla.py index cdbfacbe5c01..65cd9ea16289 100644 --- a/src/transformers/models/evolla/modular_evolla.py +++ b/src/transformers/models/evolla/modular_evolla.py @@ -13,26 +13,27 @@ # See the License for the specific language governing permissions and # limitations under the License. -import warnings from dataclasses import dataclass from typing import Optional, Union import torch -from torch import Tensor, nn +from torch import nn from ...cache_utils import Cache, DynamicCache from ...generation import GenerationMixin from ...masking_utils import create_causal_mask +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_attention_mask_for_sdpa from ...modeling_outputs import ( BaseModelOutputWithPast, BaseModelOutputWithPoolingAndCrossAttentions, CausalLMOutputWithPast, ModelOutput, ) -from ...modeling_utils import ModuleUtilsMixin, PreTrainedModel, get_parameter_dtype +from ...modeling_utils import PreTrainedModel from ...utils import ( auto_docstring, can_return_tuple, + is_torch_flex_attn_available, logging, ) from ...utils.deprecation import deprecate_kwarg @@ -59,6 +60,10 @@ from .configuration_evolla import EvollaConfig, SaProtConfig +if is_torch_flex_attn_available(): + from ...integrations.flex_attention import make_flex_block_causal_mask + + logger = logging.get_logger(__name__) @@ -144,6 +149,8 @@ def __init__(self, config, position_embedding_type=None, layer_idx=None, is_cros self.key = nn.Linear(config.hidden_size, self.all_head_size) self.value = nn.Linear(config.hidden_size, self.all_head_size) + self.dropout = config.attention_probs_dropout_prob + self.rotary_embeddings = None self.position_embedding_type = position_embedding_type or getattr( config, "position_embedding_type", "absolute" @@ -191,6 +198,7 @@ class EvollaSaProtPreTrainedModel(PreTrainedModel): _no_split_modules = ["EvollaSaProtLayer"] _supports_flash_attn = True _supports_sdpa = True + _supports_flex_attn = True _supports_attention_backend = True _can_record_outputs = { @@ -242,6 +250,7 @@ def forward( self, input_ids: Optional[torch.Tensor], attention_mask: Optional[torch.Tensor] = None, + **kwargs, ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: input_shape = input_ids.size() batch_size, seq_length = input_shape @@ -249,10 +258,14 @@ def forward( device = input_ids.device if attention_mask is None: attention_mask = torch.ones(((batch_size, seq_length)), device=device) - inputs_embeds = self.embeddings(input_ids=input_ids, attention_mask=attention_mask) - extended_attention_mask = 
self.get_extended_attention_mask(attention_mask, input_shape) - encoder_outputs = self.encoder(inputs_embeds, attention_mask=extended_attention_mask) + + attention_mask = self._update_full_mask( + attention_mask, + inputs_embeds, + ) + + encoder_outputs = self.encoder(inputs_embeds, attention_mask=attention_mask, **kwargs) sequence_output = encoder_outputs[0] return BaseModelOutputWithPoolingAndCrossAttentions( @@ -262,61 +275,28 @@ def forward( cross_attentions=encoder_outputs.cross_attentions, ) - def get_extended_attention_mask( + # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_full_mask + def _update_full_mask( self, - attention_mask: Tensor, - input_shape: tuple[int], - device: Optional[torch.device] = None, - dtype: Optional[torch.dtype] = None, - ) -> Tensor: - """ - Makes broadcastable attention and causal masks so that future and masked tokens are ignored. - - Arguments: - attention_mask (`torch.Tensor`): - Mask with ones indicating tokens to attend to, zeros for tokens to ignore. - input_shape (`Tuple[int]`): - The shape of the input to the model. - - Returns: - `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. - """ - if dtype is None: - dtype = get_parameter_dtype(self) - - if not (attention_mask.dim() == 2 and self.config.is_decoder): - # show warning only if it won't be shown in `create_extended_attention_mask_for_decoder` - if device is not None: - warnings.warn( - "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning - ) - # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] - # ourselves in which case we just need to make it broadcastable to all heads. - if attention_mask.dim() == 3: - extended_attention_mask = attention_mask[:, None, :, :] - elif attention_mask.dim() == 2: - # Provided a padding mask of dimensions [batch_size, seq_length] - # - if the model is a decoder, apply a causal mask in addition to the padding mask - # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] - if self.config.is_decoder: - extended_attention_mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder( - input_shape, attention_mask, device - ) + attention_mask: Union[torch.Tensor, None], + inputs_embeds: torch.Tensor, + ): + if attention_mask is not None: + if "flash" in self.config._attn_implementation: + attention_mask = attention_mask if 0 in attention_mask else None + elif self.config._attn_implementation == "sdpa": + # output_attentions=True & head_mask can not be supported when using SDPA, fall back to + # the manual implementation that requires a 4D causal mask in all cases. 
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype) + elif self.config._attn_implementation == "flex_attention": + if isinstance(attention_mask, torch.Tensor): + attention_mask = make_flex_block_causal_mask(attention_mask, is_causal=False) else: - extended_attention_mask = attention_mask[:, None, None, :] - else: - raise ValueError( - f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" - ) + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype) - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and the dtype's smallest value for masked positions. - # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. - extended_attention_mask = extended_attention_mask.to(dtype=dtype) # fp16 compatibility - extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(dtype).min - return extended_attention_mask + return attention_mask class EvollaSequenceCompressorAttention(nn.Module): @@ -782,8 +762,8 @@ def forward( class EvollaPreTrainedModel(LlamaPreTrainedModel): - _supports_flash_attn = False # see dependency on `EvollaSaProtProteinEncoder` - _supports_flex_attn = False # see dependency on `EvollaSaProtProteinEncoder` + _supports_flash_attn = False # see dependency on `EvollaSequenceCompressorResampler` + _supports_flex_attn = False # see dependency on `EvollaSequenceCompressorResampler` _supports_attention_backend = False _no_split_modules = [ "EvollaDecoderLayer", From c62643181d8a514668ab9744f247b786f9dd24d8 Mon Sep 17 00:00:00 2001 From: vasqu Date: Tue, 30 Sep 2025 17:09:51 +0200 Subject: [PATCH 5/5] oops --- src/transformers/models/esm/modeling_esm.py | 4 ---- src/transformers/models/evolla/modeling_evolla.py | 2 -- src/transformers/models/evolla/modular_evolla.py | 2 -- 3 files changed, 8 deletions(-) diff --git a/src/transformers/models/esm/modeling_esm.py b/src/transformers/models/esm/modeling_esm.py index 91740a60c942..6be0469b5cbe 100755 --- a/src/transformers/models/esm/modeling_esm.py +++ b/src/transformers/models/esm/modeling_esm.py @@ -720,8 +720,6 @@ def _update_full_mask( if "flash" in self.config._attn_implementation: attention_mask = attention_mask if 0 in attention_mask else None elif self.config._attn_implementation == "sdpa": - # output_attentions=True & head_mask can not be supported when using SDPA, fall back to - # the manual implementation that requires a 4D causal mask in all cases. # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype) elif self.config._attn_implementation == "flex_attention": @@ -746,8 +744,6 @@ def _update_cross_attn_mask( if "flash" in self.config._attn_implementation: encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None elif self.config._attn_implementation == "sdpa": - # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on - # the manual implementation that requires a 4D causal mask in all cases. 
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa( encoder_attention_mask, diff --git a/src/transformers/models/evolla/modeling_evolla.py b/src/transformers/models/evolla/modeling_evolla.py index 82cee79d93e9..d1d3d8e4e90a 100644 --- a/src/transformers/models/evolla/modeling_evolla.py +++ b/src/transformers/models/evolla/modeling_evolla.py @@ -617,8 +617,6 @@ def _update_full_mask( if "flash" in self.config._attn_implementation: attention_mask = attention_mask if 0 in attention_mask else None elif self.config._attn_implementation == "sdpa": - # output_attentions=True & head_mask can not be supported when using SDPA, fall back to - # the manual implementation that requires a 4D causal mask in all cases. # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype) elif self.config._attn_implementation == "flex_attention": diff --git a/src/transformers/models/evolla/modular_evolla.py b/src/transformers/models/evolla/modular_evolla.py index 65cd9ea16289..b69b27dbf26a 100644 --- a/src/transformers/models/evolla/modular_evolla.py +++ b/src/transformers/models/evolla/modular_evolla.py @@ -285,8 +285,6 @@ def _update_full_mask( if "flash" in self.config._attn_implementation: attention_mask = attention_mask if 0 in attention_mask else None elif self.config._attn_implementation == "sdpa": - # output_attentions=True & head_mask can not be supported when using SDPA, fall back to - # the manual implementation that requires a 4D causal mask in all cases. # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype) elif self.config._attn_implementation == "flex_attention":