From 2015fa216421011f849fdfc5db54f2950b702ecf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ana=20B=C4=83lt=C4=83re=C8=9Bu?=
Date: Mon, 17 Nov 2025 15:56:19 +0100
Subject: [PATCH 1/2] Update problem 53, clarify starter code

---
 .../learn.md        |  2 +-
 .../meta.json       |  4 ++
 .../solution.py     | 20 ++++++---
 .../starter_code.py | 45 +++++++++++++++++--
 4 files changed, 60 insertions(+), 11 deletions(-)

diff --git a/questions/53_implement-self-attention-mechanism/learn.md b/questions/53_implement-self-attention-mechanism/learn.md
index e8922dd1..fd3bdef5 100644
--- a/questions/53_implement-self-attention-mechanism/learn.md
+++ b/questions/53_implement-self-attention-mechanism/learn.md
@@ -34,7 +34,7 @@ $$
 \text{Attention}(Q, K, V) = \text{softmax} \left( \frac{Q K^T}{\sqrt{d_k}} \right) V
 $$
 
-where $d_k$ is the dimensionality of the key vectors.
+where $d_k$ is the dimensionality of the key vectors (i.e., the number of features used to describe each token).
 
 ---
 
diff --git a/questions/53_implement-self-attention-mechanism/meta.json b/questions/53_implement-self-attention-mechanism/meta.json
index 0a0cbe15..fcc2c90b 100644
--- a/questions/53_implement-self-attention-mechanism/meta.json
+++ b/questions/53_implement-self-attention-mechanism/meta.json
@@ -10,6 +10,10 @@
     {
       "profile_link": "https://github.com/Jayanth-vardhan",
       "name": "Jayanth-vardhan"
+    },
+    {
+      "profile_link": "https://github.com/ana-baltaretu",
+      "name": "anisca22"
     }
   ]
 }
\ No newline at end of file
diff --git a/questions/53_implement-self-attention-mechanism/solution.py b/questions/53_implement-self-attention-mechanism/solution.py
index 2d727c7a..3f5ba27c 100644
--- a/questions/53_implement-self-attention-mechanism/solution.py
+++ b/questions/53_implement-self-attention-mechanism/solution.py
@@ -1,14 +1,20 @@
 import numpy as np
 
-def compute_qkv(X, W_q, W_k, W_v):
-    Q = np.dot(X, W_q)
-    K = np.dot(X, W_k)
-    V = np.dot(X, W_v)
+
+def compute_qkv(x: np.ndarray, W_q: np.ndarray, W_k: np.ndarray, W_v: np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    Q = np.dot(x, W_q)
+    K = np.dot(x, W_k)
+    V = np.dot(x, W_v)
     return Q, K, V
 
-def self_attention(Q, K, V):
-    d_k = Q.shape[1]
+
+def softmax(x: np.ndarray, axis: int = 1) -> np.ndarray:
+    return np.exp(x) / np.sum(np.exp(x), axis=axis, keepdims=True)
+
+
+def self_attention(Q: np.ndarray, K: np.ndarray, V: np.ndarray) -> np.ndarray:
+    d_k = K.shape[1]
     scores = np.matmul(Q, K.T) / np.sqrt(d_k)
-    attention_weights = np.exp(scores) / np.sum(np.exp(scores), axis=1, keepdims=True)
+    attention_weights = softmax(scores, axis=1)
     attention_output = np.matmul(attention_weights, V)
     return attention_output
diff --git a/questions/53_implement-self-attention-mechanism/starter_code.py b/questions/53_implement-self-attention-mechanism/starter_code.py
index 230bf496..e018ac53 100644
--- a/questions/53_implement-self-attention-mechanism/starter_code.py
+++ b/questions/53_implement-self-attention-mechanism/starter_code.py
@@ -1,5 +1,44 @@
 import numpy as np
 
-def self_attention(Q, K, V):
-    
-	return attention_output
+
+def compute_qkv(x: np.ndarray, W_q: np.ndarray, W_k: np.ndarray, W_v: np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
+    """
+    Compute query, key and value matrices from input embeddings (of length dim_in).
+
+    x: (n_tokens, dim_in) input embeddings
+    W_q: (dim_in, dim_qk) query weights
+    W_k: (dim_in, dim_qk) key weights
+    W_v: (dim_in, dim_v) value weights
+    Returns (Q, K, V) with shapes (n_tokens, dim_qk), (n_tokens, dim_qk), (n_tokens, dim_v)
+    """
+    # TODO: return (Q, K, V)
+    pass
+
+
+def softmax(x: np.ndarray, axis: int = 1) -> np.ndarray:
+    """
+    Apply softmax along the given axis.
+
+    x: input array
+    axis: the axis to normalize along
+    Returns array of same shape where values along `axis` sum to 1
+    """
+    # TODO: return softmax_output
+    pass
+
+
+def self_attention(Q: np.ndarray, K: np.ndarray, V: np.ndarray) -> np.ndarray:
+    """
+    Compute scaled dot product self attention.
+
+    Q: (n_tokens, dim_qk) queries
+    K: (n_tokens, dim_qk) keys
+    V: (n_tokens, dim_v) values
+    Returns attention output of shape (n_tokens, dim_v)
+    """
+    # TODO: return attention_output
+    pass
+
+
+
+

From f76e615bbc0099b6b19d06e74dbf365892eb78fd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ana=20B=C4=83lt=C4=83re=C8=9Bu?=
Date: Mon, 17 Nov 2025 16:04:54 +0100
Subject: [PATCH 2/2] Update build output

---
 build/53.json | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/build/53.json b/build/53.json
index df0ba76a..c8fb749f 100644
--- a/build/53.json
+++ b/build/53.json
@@ -10,12 +10,16 @@
     {
       "profile_link": "https://github.com/Jayanth-vardhan",
       "name": "Jayanth-vardhan"
+    },
+    {
+      "profile_link": "https://github.com/ana-baltaretu",
+      "name": "anisca22"
     }
   ],
   "description": "## Task: Implement the Self-Attention Mechanism\n\nYour task is to implement the **self-attention** mechanism, which is a fundamental component of transformer models, widely used in natural language processing and computer vision tasks. The self-attention mechanism allows a model to dynamically focus on different parts of the input sequence when generating a contextualized representation.\n\nYour function should return the self-attention output as a numpy array.\n\n ",
-  "learn_section": "## Self-Attention Mechanism\n\nThe **self-attention mechanism** is a fundamental concept in **transformer models** and is widely used in **natural language processing (NLP)** and **computer vision (CV)**. It allows models to dynamically weigh different parts of the input sequence, enabling them to capture **long-range dependencies** effectively.\n\n---\n\n### **Understanding Self-Attention**\n\nSelf-attention helps a model determine **which parts of an input sequence are relevant to each other**. Instead of treating every word or token equally, self-attention assigns different weights to different parts of the sequence, allowing the model to capture contextual relationships.\n\nFor example, in machine translation, self-attention allows the model to **focus on relevant words** from the input sentence when generating each word in the output.\n\n---\n\n### **Mathematical Formulation of Self-Attention**\n\nGiven an input sequence $X$, self-attention computes three key components:\n\n1. **Query ($Q$)**: Represents the current token we are processing.\n2. **Key ($K$)**: Represents each token in the sequence.\n3. **Value ($V$)**: Contains the actual token embeddings.\n\nThe Query, Key, and Value matrices are computed as:\n\n$$\nQ = X W_Q, \\quad K = X W_K, \\quad V = X W_V\n$$\n\nwhere $W_Q$, $W_K$, and $W_V$ are learned weight matrices.\n\nThe attention scores are computed using the **scaled dot-product attention**:\n\n$$\n\\text{Attention}(Q, K, V) = \\text{softmax} \\left( \\frac{Q K^T}{\\sqrt{d_k}} \\right) V\n$$\n\nwhere $d_k$ is the dimensionality of the key vectors.\n\n---\n\n### **Why Self-Attention is Powerful?**\n\n- **Captures long-range dependencies**: Unlike RNNs, which process input sequentially, self-attention can relate any word in the sequence to any other word, regardless of distance.\n- **Parallelization**: Since self-attention is computed **simultaneously** across the entire sequence, it is much faster than sequential models like LSTMs.\n- **Contextual Understanding**: Each token is **contextually enriched** by attending to relevant tokens in the sequence.\n\n---\n\n### **Example Calculation**\n\nConsider an input sequence of three tokens:\n\n$$\nX = \\begin{bmatrix} x_1 \\\\ x_2 \\\\ x_3 \\end{bmatrix}\n$$\n\nWe compute $Q$, $K$, and $V$ as:\n\n$$\nQ = X W_Q, \\quad K = X W_K, \\quad V = X W_V\n$$\n\nNext, we compute the attention scores:\n\n$$\nS = \\frac{Q K^T}{\\sqrt{d_k}}\n$$\n\nApplying the softmax function:\n\n$$\nA = \\text{softmax}(S)\n$$\n\nFinally, the weighted sum of values:\n\n$$\n\\text{Output} = A V\n$$\n\n---\n\n### **Applications of Self-Attention**\n\nSelf-attention is widely used in:\n- **Transformer models (e.g., BERT, GPT-3)** for language modeling.\n- **Speech processing models** for transcribing audio.\n- **Vision Transformers (ViTs)** for computer vision tasks.\n- **Recommender systems** for learning item-user relationships.\n\nMastering self-attention is essential for understanding modern deep learning architectures, especially in NLP and computer vision.",
-  "starter_code": "import numpy as np\n\ndef self_attention(Q, K, V):\n    \n\treturn attention_output",
-  "solution": "import numpy as np\n\ndef compute_qkv(X, W_q, W_k, W_v):\n    Q = np.dot(X, W_q)\n    K = np.dot(X, W_k)\n    V = np.dot(X, W_v)\n    return Q, K, V\n\ndef self_attention(Q, K, V):\n    d_k = Q.shape[1]\n    scores = np.matmul(Q, K.T) / np.sqrt(d_k)\n    attention_weights = np.exp(scores) / np.sum(np.exp(scores), axis=1, keepdims=True)\n    attention_output = np.matmul(attention_weights, V)\n    return attention_output",
+  "learn_section": "## Self-Attention Mechanism\n\nThe **self-attention mechanism** is a fundamental concept in **transformer models** and is widely used in **natural language processing (NLP)** and **computer vision (CV)**. It allows models to dynamically weigh different parts of the input sequence, enabling them to capture **long-range dependencies** effectively.\n\n---\n\n### **Understanding Self-Attention**\n\nSelf-attention helps a model determine **which parts of an input sequence are relevant to each other**. Instead of treating every word or token equally, self-attention assigns different weights to different parts of the sequence, allowing the model to capture contextual relationships.\n\nFor example, in machine translation, self-attention allows the model to **focus on relevant words** from the input sentence when generating each word in the output.\n\n---\n\n### **Mathematical Formulation of Self-Attention**\n\nGiven an input sequence $X$, self-attention computes three key components:\n\n1. **Query ($Q$)**: Represents the current token we are processing.\n2. **Key ($K$)**: Represents each token in the sequence.\n3. **Value ($V$)**: Contains the actual token embeddings.\n\nThe Query, Key, and Value matrices are computed as:\n\n$$\nQ = X W_Q, \\quad K = X W_K, \\quad V = X W_V\n$$\n\nwhere $W_Q$, $W_K$, and $W_V$ are learned weight matrices.\n\nThe attention scores are computed using the **scaled dot-product attention**:\n\n$$\n\\text{Attention}(Q, K, V) = \\text{softmax} \\left( \\frac{Q K^T}{\\sqrt{d_k}} \\right) V\n$$\n\nwhere $d_k$ is the dimensionality of the key vectors (i.e., the number of features used to describe each token).\n\n---\n\n### **Why Self-Attention is Powerful?**\n\n- **Captures long-range dependencies**: Unlike RNNs, which process input sequentially, self-attention can relate any word in the sequence to any other word, regardless of distance.\n- **Parallelization**: Since self-attention is computed **simultaneously** across the entire sequence, it is much faster than sequential models like LSTMs.\n- **Contextual Understanding**: Each token is **contextually enriched** by attending to relevant tokens in the sequence.\n\n---\n\n### **Example Calculation**\n\nConsider an input sequence of three tokens:\n\n$$\nX = \\begin{bmatrix} x_1 \\\\ x_2 \\\\ x_3 \\end{bmatrix}\n$$\n\nWe compute $Q$, $K$, and $V$ as:\n\n$$\nQ = X W_Q, \\quad K = X W_K, \\quad V = X W_V\n$$\n\nNext, we compute the attention scores:\n\n$$\nS = \\frac{Q K^T}{\\sqrt{d_k}}\n$$\n\nApplying the softmax function:\n\n$$\nA = \\text{softmax}(S)\n$$\n\nFinally, the weighted sum of values:\n\n$$\n\\text{Output} = A V\n$$\n\n---\n\n### **Applications of Self-Attention**\n\nSelf-attention is widely used in:\n- **Transformer models (e.g., BERT, GPT-3)** for language modeling.\n- **Speech processing models** for transcribing audio.\n- **Vision Transformers (ViTs)** for computer vision tasks.\n- **Recommender systems** for learning item-user relationships.\n\nMastering self-attention is essential for understanding modern deep learning architectures, especially in NLP and computer vision.",
+  "starter_code": "import numpy as np\n\n\ndef compute_qkv(x: np.ndarray, W_q: np.ndarray, W_k: np.ndarray, W_v: np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray]:\n    \"\"\"\n    Compute query, key and value matrices from input embeddings (of length dim_in).\n\n    x: (n_tokens, dim_in) input embeddings\n    W_q: (dim_in, dim_qk) query weights\n    W_k: (dim_in, dim_qk) key weights\n    W_v: (dim_in, dim_v) value weights\n    Returns (Q, K, V) with shapes (n_tokens, dim_qk), (n_tokens, dim_qk), (n_tokens, dim_v)\n    \"\"\"\n    # TODO: return (Q, K, V)\n    pass\n\n\ndef softmax(x: np.ndarray, axis: int = 1) -> np.ndarray:\n    \"\"\"\n    Apply softmax along the given axis.\n\n    x: input array\n    axis: the axis to normalize along\n    Returns array of same shape where values along `axis` sum to 1\n    \"\"\"\n    # TODO: return softmax_output\n    pass\n\n\ndef self_attention(Q: np.ndarray, K: np.ndarray, V: np.ndarray) -> np.ndarray:\n    \"\"\"\n    Compute scaled dot product self attention.\n\n    Q: (n_tokens, dim_qk) queries\n    K: (n_tokens, dim_qk) keys\n    V: (n_tokens, dim_v) values\n    Returns attention output of shape (n_tokens, dim_v)\n    \"\"\"\n    # TODO: return attention_output\n    pass",
+  "solution": "import numpy as np\n\n\ndef compute_qkv(x: np.ndarray, W_q: np.ndarray, W_k: np.ndarray, W_v: np.ndarray) -> tuple[np.ndarray, np.ndarray, np.ndarray]:\n    Q = np.dot(x, W_q)\n    K = np.dot(x, W_k)\n    V = np.dot(x, W_v)\n    return Q, K, V\n\n\ndef softmax(x: np.ndarray, axis: int = 1) -> np.ndarray:\n    return np.exp(x) / np.sum(np.exp(x), axis=axis, keepdims=True)\n\n\ndef self_attention(Q: np.ndarray, K: np.ndarray, V: np.ndarray) -> np.ndarray:\n    d_k = K.shape[1]\n    scores = np.matmul(Q, K.T) / np.sqrt(d_k)\n    attention_weights = softmax(scores, axis=1)\n    attention_output = np.matmul(attention_weights, V)\n    return attention_output",
   "example": {
     "input": "import numpy as np\n\nX = np.array([[1, 0], [0, 1]])\nW_q = np.array([[1, 0], [0, 1]])\nW_k = np.array([[1, 0], [0, 1]])\nW_v = np.array([[1, 2], [3, 4]])\n\nQ, K, V = compute_qkv(X, W_q, W_k, W_v)\noutput = self_attention(Q, K, V)\n\nprint(output)",
     "output": "# [[1.660477 2.660477]\n# [2.339523 3.339523]]",
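
As a quick local sanity check of the updated solution against the example shipped in build/53.json, the short script below is a sketch rather than part of the patch: it inlines the patched compute_qkv, softmax, and self_attention from solution.py, runs them on the documented example input, and compares the result with the documented expected output (the 1e-5 tolerance is an assumption chosen to absorb the six-decimal rounding of that expected matrix).

import numpy as np


def compute_qkv(x, W_q, W_k, W_v):
    # Project input embeddings into query, key, and value spaces, as in the patched solution.py.
    return np.dot(x, W_q), np.dot(x, W_k), np.dot(x, W_v)


def softmax(x, axis=1):
    # Row-wise softmax, matching the patched helper.
    return np.exp(x) / np.sum(np.exp(x), axis=axis, keepdims=True)


def self_attention(Q, K, V):
    # Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.
    d_k = K.shape[1]
    scores = np.matmul(Q, K.T) / np.sqrt(d_k)
    return np.matmul(softmax(scores, axis=1), V)


# Example input from build/53.json.
X = np.array([[1, 0], [0, 1]])
W_q = np.array([[1, 0], [0, 1]])
W_k = np.array([[1, 0], [0, 1]])
W_v = np.array([[1, 2], [3, 4]])

Q, K, V = compute_qkv(X, W_q, W_k, W_v)
output = self_attention(Q, K, V)

# Expected output from build/53.json, rounded to six decimals.
expected = np.array([[1.660477, 2.660477], [2.339523, 3.339523]])
assert np.allclose(output, expected, atol=1e-5)
print(output)

If the inputs ever grow beyond this toy scale, a common hardening (not part of this patch) is to subtract the per-row maximum from the scores before exponentiating in softmax, which avoids overflow without changing the resulting weights.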