
Commit 45a9988

Merge pull request #505 from komaksym/add_new_q_mixed_precision_training
Add new problem: Mixed Precision Training
2 parents: 6502a6a + 58a0e36

7 files changed (+157 −0)

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
Write a Python class to implement Mixed Precision Training that uses both float32 and float16 data types to optimize memory usage and speed. Your class should have an `__init__(self, loss_scale=1024.0)` method to initialize the loss scaling factor. Implement `forward(self, weights, inputs, targets)` to perform the forward pass with float16 computation and return the scaled Mean Squared Error (MSE) loss in float32, and `backward(self, gradients)` to unscale the gradients and check for overflow. Use float16 for computations but float32 for gradient accumulation. Return the gradients as float32, and set them to zero if overflow is detected. Only use NumPy.
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
{
    "input": "import numpy as np\nmp = MixedPrecision(loss_scale=1024.0)\nweights = np.array([0.5, -0.3], dtype=np.float32)\ninputs = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)\ntargets = np.array([1.0, 0.0], dtype=np.float32)\nloss = mp.forward(weights, inputs, targets)\nprint(f\"Loss: {loss:.4f}\")\nprint(f\"Loss dtype: {type(loss).__name__}\")\ngrads = np.array([512.0, -256.0], dtype=np.float32)\nresult = mp.backward(grads)\nprint(f\"Gradients: {result}\")\nprint(f\"Grad dtype: {result.dtype}\")",
    "output": "Loss: 665.0000\nLoss dtype: float\nGradients: [ 0.5  -0.25]\nGrad dtype: float32",
    "reasoning": "The forward pass converts the inputs to float16, computes the MSE loss, scales it by the loss scale, and returns the result as a Python float. The backward pass converts the gradients to float32, checks for overflow, and unscales them. The final gradients must have dtype float32."
}
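
A quick check of the numbers above: in exact arithmetic the predictions are [-0.1, 0.3], so the MSE is ((1 - (-0.1))² + (0 - 0.3)²) / 2 = 0.65 and the scaled loss would be 0.65 × 1024 = 665.6; the expected 665.0000 reflects float16 rounding in the forward pass, where the computed mean comes out to 665/1024 ≈ 0.6494. The backward numbers are exact: [512, -256] / 1024 = [0.5, -0.25], returned with dtype float32.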
Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
# **Mixed Precision Training**

## **1. Definition**

Mixed Precision Training is a **deep learning optimization technique** that uses both **float16** (half precision) and **float32** (single precision) data types during training to reduce memory usage and increase training speed while maintaining model accuracy.

The technique works by:

- **Using float16 for forward pass computations** to save memory and increase speed
- **Using float32 for gradient accumulation** to maintain numerical precision
- **Applying loss scaling** to prevent gradient underflow in float16

---

## **2. Key Components**

### **Mean Squared Error (MSE) Loss**

The loss function must be computed as the Mean Squared Error:

$$
\text{MSE} = \frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2
$$

where $y_i$ is the target and $\hat{y}_i$ is the prediction for sample $i$.

### **Loss Scaling**

To prevent gradient underflow in float16, the loss is scaled up during the forward pass, which scales the resulting gradients by the same factor:

$$
\text{scaled\_loss} = \text{MSE} \times \text{scale\_factor}
$$

The gradients are then unscaled during the backward pass:

$$
\text{gradient} = \frac{\text{scaled\_gradient}}{\text{scale\_factor}}
$$

### **Overflow Detection**

Check for invalid gradients (NaN or Inf) that indicate numerical overflow:

$$
\text{overflow} = \text{any}\left(\text{isnan}(\text{gradients}) \text{ or } \text{isinf}(\text{gradients})\right)
$$
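
As a rough NumPy illustration of both ideas (the values in the comments are approximate): a gradient that underflows to zero in float16 survives once it is scaled by a factor such as 1024, and non-finite values are caught by the overflow check.

```python
import numpy as np

scale = 1024.0
tiny_grad = 1e-8                                   # smaller than float16 can represent

print(np.float16(tiny_grad))                       # 0.0   -> underflows, the value is lost
scaled = np.float16(tiny_grad * scale)             # ~1e-05 -> representable in float16
recovered = np.float32(scaled) / scale             # unscale in float32
print(recovered)                                   # ~1e-08 -> magnitude preserved

grads = np.array([np.inf, 1.0], dtype=np.float32)
overflow = np.any(np.isnan(grads)) or np.any(np.isinf(grads))
print(overflow)                                    # True -> zero the gradients / skip the update
```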

---

## **3. Precision Usage**

- **float16**: Forward pass computations, activations, temporary calculations
- **float32**: Gradient accumulation, parameter updates, loss scaling
- **Automatic casting**: Convert between precisions as needed
- **Loss computation**: Use MSE as the loss function before scaling

---

## **4. Benefits and Applications**

- **Memory Efficiency**: Reduces memory usage by ~50% for activations
- **Speed Improvement**: Faster computation on modern GPUs with Tensor Cores
- **Training Stability**: Loss scaling prevents gradient underflow
- **Model Accuracy**: Maintains accuracy comparable to full-precision training

Mixed precision training is common for large neural networks, where memory is a constraint and speed is critical.

---
Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
{
    "id": "160",
    "title": "Mixed Precision Training",
    "difficulty": "medium",
    "category": "Machine Learning",
    "video": "",
    "likes": "0",
    "dislikes": "0",
    "contributor": [
        {
            "profile_link": "https://github.com/komaksym",
            "name": "komaksym"
        }
    ]
}
Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
import numpy as np


class MixedPrecision:
    def __init__(self, loss_scale=1024.0):
        self.loss_scale = loss_scale

    def forward(self, weights, inputs, targets):
        # Convert ALL inputs to float16 for computation (regardless of input dtype)
        weights_fp16 = weights.astype(np.float16)
        inputs_fp16 = inputs.astype(np.float16)
        targets_fp16 = targets.astype(np.float16)

        # Simple forward pass: linear model + MSE loss, computed in float16
        predictions = np.dot(inputs_fp16, weights_fp16)
        loss = np.mean((targets_fp16 - predictions) ** 2)

        # Scale the loss in float32 and return it as a Python float
        scaled_loss = loss.astype(np.float32) * self.loss_scale
        return float(scaled_loss)

    def backward(self, gradients):
        # Convert gradients to float32 for precision (regardless of input dtype)
        gradients_fp32 = gradients.astype(np.float32)

        # Check for overflow (NaN or Inf)
        overflow = np.any(np.isnan(gradients_fp32)) or np.any(np.isinf(gradients_fp32))

        if overflow:
            # Return zero gradients if overflow is detected (must be float32)
            return np.zeros_like(gradients_fp32, dtype=np.float32)

        # Unscale gradients (result stays float32)
        unscaled_gradients = gradients_fp32 / self.loss_scale
        return unscaled_gradients.astype(np.float32)
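
A brief usage sketch mirroring the example case above (the printed values assume the reference behavior described earlier):

```python
import numpy as np

mp = MixedPrecision(loss_scale=1024.0)

weights = np.array([0.5, -0.3], dtype=np.float32)
inputs = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
targets = np.array([1.0, 0.0], dtype=np.float32)

loss = mp.forward(weights, inputs, targets)                       # scaled MSE as a Python float
grads = mp.backward(np.array([512.0, -256.0], dtype=np.float32))  # unscaled, dtype float32

print(loss)         # 665.0
print(grads)        # [ 0.5  -0.25]
print(grads.dtype)  # float32
```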
Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
import numpy as np

class MixedPrecision:
    def __init__(self, loss_scale=1024.0):
        # Initialize loss scaling factor
        pass

    def forward(self, weights, inputs, targets):
        # Perform forward pass with float16, return scaled loss as float32
        pass

    def backward(self, gradients):
        # Unscale gradients and check for overflow, return as float32
        pass
Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
[
    {
        "test": "import numpy as np\nmp = MixedPrecision(loss_scale=1024.0)\nweights = np.array([0.5, -0.3], dtype=np.float32)\ninputs = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)\ntargets = np.array([1.0, 0.0], dtype=np.float32)\nloss = mp.forward(weights, inputs, targets)\nprint(f\"Loss: {loss:.4f}\")\nprint(f\"Loss dtype: {type(loss).__name__}\")",
        "expected_output": "Loss: 665.0000\nLoss dtype: float"
    },
    {
        "test": "import numpy as np\nmp = MixedPrecision(loss_scale=1024.0)\ngrads = np.array([512.0, -256.0], dtype=np.float32)\nresult = mp.backward(grads)\nprint(f\"Gradients: {result}\")\nprint(f\"Grad dtype: {result.dtype}\")",
        "expected_output": "Gradients: [ 0.5  -0.25]\nGrad dtype: float32"
    },
    {
        "test": "import numpy as np\nmp = MixedPrecision(loss_scale=512.0)\nweights = np.array([1.0, 0.5], dtype=np.float64)\ninputs = np.array([[2.0, 1.0]], dtype=np.float64)\ntargets = np.array([3.0], dtype=np.float64)\nloss = mp.forward(weights, inputs, targets)\nprint(f\"Loss: {loss:.1f}\")\nprint(f\"Loss dtype: {type(loss).__name__}\")",
        "expected_output": "Loss: 128.0\nLoss dtype: float"
    },
    {
        "test": "import numpy as np\nmp = MixedPrecision(loss_scale=512.0)\ngrads = np.array([1024.0, 512.0], dtype=np.float16)\nresult = mp.backward(grads)\nprint(f\"Gradients: [{result[0]:.0f} {result[1]:.0f}]\")\nprint(f\"Grad dtype: {result.dtype}\")",
        "expected_output": "Gradients: [2 1]\nGrad dtype: float32"
    },
    {
        "test": "import numpy as np\nmp = MixedPrecision(loss_scale=100.0)\nweights = np.array([0.1, 0.2], dtype=np.float32)\ninputs = np.array([[1.0, 1.0]], dtype=np.float32)\ntargets = np.array([0.5], dtype=np.float32)\nloss = mp.forward(weights, inputs, targets)\nprint(f\"Loss: {loss:.1f}\")\nprint(f\"Loss dtype: {type(loss).__name__}\")",
        "expected_output": "Loss: 4.0\nLoss dtype: float"
    },
    {
        "test": "import numpy as np\nmp = MixedPrecision(loss_scale=100.0)\ngrads = np.array([200.0, 100.0], dtype=np.float64)\nresult = mp.backward(grads)\nprint(f\"Gradients: [{result[0]:.0f} {result[1]:.0f}]\")\nprint(f\"Grad dtype: {result.dtype}\")",
        "expected_output": "Gradients: [2 1]\nGrad dtype: float32"
    },
    {
        "test": "import numpy as np\nmp = MixedPrecision(loss_scale=2048.0)\nweights = np.array([0.25], dtype=np.float64)\ninputs = np.array([[4.0]], dtype=np.float64)\ntargets = np.array([2.0], dtype=np.float64)\nloss = mp.forward(weights, inputs, targets)\nprint(f\"Loss: {loss:.1f}\")\nprint(f\"Loss dtype: {type(loss).__name__}\")",
        "expected_output": "Loss: 2048.0\nLoss dtype: float"
    },
    {
        "test": "import numpy as np\nmp = MixedPrecision(loss_scale=2048.0)\ngrads = np.array([np.nan], dtype=np.float16)\nresult = mp.backward(grads)\nprint(f\"Gradients: [{result[0]:.0f}]\")\nprint(f\"Grad dtype: {result.dtype}\")",
        "expected_output": "Gradients: [0]\nGrad dtype: float32"
    },
    {
        "test": "import numpy as np\nmp = MixedPrecision(loss_scale=256.0)\nweights = np.array([1.0], dtype=np.float16)\ninputs = np.array([[2.0]], dtype=np.float16)\ntargets = np.array([3.0], dtype=np.float16)\nloss = mp.forward(weights, inputs, targets)\nprint(f\"Loss: {loss:.1f}\")\nprint(f\"Loss dtype: {type(loss).__name__}\")",
        "expected_output": "Loss: 256.0\nLoss dtype: float"
    },
    {
        "test": "import numpy as np\nmp = MixedPrecision(loss_scale=256.0)\ngrads = np.array([np.inf], dtype=np.float64)\nresult = mp.backward(grads)\nprint(f\"Gradients: [{result[0]:.0f}]\")\nprint(f\"Grad dtype: {result.dtype}\")",
        "expected_output": "Gradients: [0]\nGrad dtype: float32"
    }
]
