
Commit 45a9988

Merge pull request #505 from komaksym/add_new_q_mixed_precision_training
Add new problem: Mixed Precision Training
2 parents: 6502a6a + 58a0e36

7 files changed (+157 −0)

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
Write a Python class to implement Mixed Precision Training that uses both float32 and float16 data types to optimize memory usage and speed. Your class should have an `__init__(self, loss_scale=1024.0)` method to initialize the loss scaling factor. Implement `forward(self, weights, inputs, targets)` to perform the forward pass with float16 computation and return the scaled Mean Squared Error (MSE) loss in float32, and `backward(self, gradients)` to unscale the gradients and check for overflow. Use float16 for computations but float32 for gradient accumulation. Return the gradients as float32, and set them to zero if overflow is detected. Only use NumPy.
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
{
    "input": "import numpy as np\nmp = MixedPrecision(loss_scale=1024.0)\nweights = np.array([0.5, -0.3], dtype=np.float32)\ninputs = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)\ntargets = np.array([1.0, 0.0], dtype=np.float32)\nloss = mp.forward(weights, inputs, targets)\nprint(f\"Loss: {loss:.4f}\")\nprint(f\"Loss dtype: {type(loss).__name__}\")\ngrads = np.array([512.0, -256.0], dtype=np.float32)\nresult = mp.backward(grads)\nprint(f\"Gradients: {result}\")\nprint(f\"Grad dtype: {result.dtype}\")",
    "output": "Loss: 665.0000\nLoss dtype: float\nGradients: [ 0.5  -0.25]\nGrad dtype: float32",
    "reasoning": "The forward pass converts the inputs to float16, computes the MSE loss, scales it by the loss scale, and returns the result as a Python float. The backward pass converts the gradients to float32, checks for overflow, and unscales them. The final gradients must have dtype float32."
}
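
A quick check of the numbers above: in exact arithmetic the predictions are [-0.1, 0.3], so the MSE is ((1 - (-0.1))² + (0 - 0.3)²) / 2 = 0.65 and the scaled loss would be 0.65 × 1024 = 665.6; the expected 665.0000 reflects float16 rounding in the forward pass, where the computed mean comes out to 665/1024 ≈ 0.6494. The backward numbers are exact: [512, -256] / 1024 = [0.5, -0.25], returned with dtype float32.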
Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
# **Mixed Precision Training**

## **1. Definition**

Mixed Precision Training is a **deep learning optimization technique** that uses both **float16** (half precision) and **float32** (single precision) data types during training to reduce memory usage and increase training speed while maintaining model accuracy.

The technique works by:

- **Using float16 for forward pass computations** to save memory and increase speed
- **Using float32 for gradient accumulation** to maintain numerical precision
- **Applying loss scaling** to prevent gradient underflow in float16

---

## **2. Key Components**

### **Mean Squared Error (MSE) Loss**

The loss function must be computed as the Mean Squared Error:

$$
\text{MSE} = \frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2
$$

where $y_i$ is the target and $\hat{y}_i$ is the prediction for sample $i$.

### **Loss Scaling**

To prevent gradient underflow in float16, the loss is scaled up during the forward pass, which scales the resulting gradients by the same factor:

$$
\text{scaled\_loss} = \text{MSE} \times \text{scale\_factor}
$$

The gradients are then unscaled during the backward pass:

$$
\text{gradient} = \frac{\text{scaled\_gradient}}{\text{scale\_factor}}
$$

### **Overflow Detection**

Check for invalid gradients (NaN or Inf) that indicate numerical overflow:

$$
\text{overflow} = \text{any}\left(\text{isnan}(\text{gradients}) \text{ or } \text{isinf}(\text{gradients})\right)
$$
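
As a rough NumPy illustration of both ideas (the values in the comments are approximate): a gradient that underflows to zero in float16 survives once it is scaled by a factor such as 1024, and non-finite values are caught by the overflow check.

```python
import numpy as np

scale = 1024.0
tiny_grad = 1e-8                                   # smaller than float16 can represent

print(np.float16(tiny_grad))                       # 0.0   -> underflows, the value is lost
scaled = np.float16(tiny_grad * scale)             # ~1e-05 -> representable in float16
recovered = np.float32(scaled) / scale             # unscale in float32
print(recovered)                                   # ~1e-08 -> magnitude preserved

grads = np.array([np.inf, 1.0], dtype=np.float32)
overflow = np.any(np.isnan(grads)) or np.any(np.isinf(grads))
print(overflow)                                    # True -> zero the gradients / skip the update
```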

---

## **3. Precision Usage**

- **float16**: Forward pass computations, activations, temporary calculations
- **float32**: Gradient accumulation, parameter updates, loss scaling
- **Automatic casting**: Convert between precisions as needed
- **Loss computation**: Use MSE as the loss function before scaling

---

## **4. Benefits and Applications**

- **Memory Efficiency**: Reduces memory usage by ~50% for activations
- **Speed Improvement**: Faster computation on modern GPUs with Tensor Cores
- **Training Stability**: Loss scaling prevents gradient underflow
- **Model Accuracy**: Maintains accuracy comparable to full-precision training

Mixed precision training is common for large neural networks, where memory is a constraint and speed is critical.

---
Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
{
    "id": "160",
    "title": "Mixed Precision Training",
    "difficulty": "medium",
    "category": "Machine Learning",
    "video": "",
    "likes": "0",
    "dislikes": "0",
    "contributor": [
        {
            "profile_link": "https://github.com/komaksym",
            "name": "komaksym"
        }
    ]
}
Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
import numpy as np


class MixedPrecision:
    def __init__(self, loss_scale=1024.0):
        self.loss_scale = loss_scale

    def forward(self, weights, inputs, targets):
        # Convert ALL inputs to float16 for computation (regardless of input dtype)
        weights_fp16 = weights.astype(np.float16)
        inputs_fp16 = inputs.astype(np.float16)
        targets_fp16 = targets.astype(np.float16)

        # Simple forward pass: linear model + MSE loss, computed in float16
        predictions = np.dot(inputs_fp16, weights_fp16)
        loss = np.mean((targets_fp16 - predictions) ** 2)

        # Scale the loss in float32 and return it as a Python float
        scaled_loss = loss.astype(np.float32) * self.loss_scale
        return float(scaled_loss)

    def backward(self, gradients):
        # Convert gradients to float32 for precision (regardless of input dtype)
        gradients_fp32 = gradients.astype(np.float32)

        # Check for overflow (NaN or Inf)
        overflow = np.any(np.isnan(gradients_fp32)) or np.any(np.isinf(gradients_fp32))

        if overflow:
            # Return zero gradients if overflow is detected (must be float32)
            return np.zeros_like(gradients_fp32, dtype=np.float32)

        # Unscale gradients (result stays float32)
        unscaled_gradients = gradients_fp32 / self.loss_scale
        return unscaled_gradients.astype(np.float32)
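
A brief usage sketch mirroring the example case above (the printed values assume the reference behavior described earlier):

```python
import numpy as np

mp = MixedPrecision(loss_scale=1024.0)

weights = np.array([0.5, -0.3], dtype=np.float32)
inputs = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
targets = np.array([1.0, 0.0], dtype=np.float32)

loss = mp.forward(weights, inputs, targets)                       # scaled MSE as a Python float
grads = mp.backward(np.array([512.0, -256.0], dtype=np.float32))  # unscaled, dtype float32

print(loss)         # 665.0
print(grads)        # [ 0.5  -0.25]
print(grads.dtype)  # float32
```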
Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
import numpy as np

class MixedPrecision:
    def __init__(self, loss_scale=1024.0):
        # Initialize loss scaling factor
        pass

    def forward(self, weights, inputs, targets):
        # Perform forward pass with float16, return scaled loss as float32
        pass

    def backward(self, gradients):
        # Unscale gradients and check for overflow, return as float32
        pass
Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
[
    {
        "test": "import numpy as np\nmp = MixedPrecision(loss_scale=1024.0)\nweights = np.array([0.5, -0.3], dtype=np.float32)\ninputs = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)\ntargets = np.array([1.0, 0.0], dtype=np.float32)\nloss = mp.forward(weights, inputs, targets)\nprint(f\"Loss: {loss:.4f}\")\nprint(f\"Loss dtype: {type(loss).__name__}\")",
        "expected_output": "Loss: 665.0000\nLoss dtype: float"
    },
    {
        "test": "import numpy as np\nmp = MixedPrecision(loss_scale=1024.0)\ngrads = np.array([512.0, -256.0], dtype=np.float32)\nresult = mp.backward(grads)\nprint(f\"Gradients: {result}\")\nprint(f\"Grad dtype: {result.dtype}\")",
        "expected_output": "Gradients: [ 0.5  -0.25]\nGrad dtype: float32"
    },
    {
        "test": "import numpy as np\nmp = MixedPrecision(loss_scale=512.0)\nweights = np.array([1.0, 0.5], dtype=np.float64)\ninputs = np.array([[2.0, 1.0]], dtype=np.float64)\ntargets = np.array([3.0], dtype=np.float64)\nloss = mp.forward(weights, inputs, targets)\nprint(f\"Loss: {loss:.1f}\")\nprint(f\"Loss dtype: {type(loss).__name__}\")",
        "expected_output": "Loss: 128.0\nLoss dtype: float"
    },
    {
        "test": "import numpy as np\nmp = MixedPrecision(loss_scale=512.0)\ngrads = np.array([1024.0, 512.0], dtype=np.float16)\nresult = mp.backward(grads)\nprint(f\"Gradients: [{result[0]:.0f} {result[1]:.0f}]\")\nprint(f\"Grad dtype: {result.dtype}\")",
        "expected_output": "Gradients: [2 1]\nGrad dtype: float32"
    },
    {
        "test": "import numpy as np\nmp = MixedPrecision(loss_scale=100.0)\nweights = np.array([0.1, 0.2], dtype=np.float32)\ninputs = np.array([[1.0, 1.0]], dtype=np.float32)\ntargets = np.array([0.5], dtype=np.float32)\nloss = mp.forward(weights, inputs, targets)\nprint(f\"Loss: {loss:.1f}\")\nprint(f\"Loss dtype: {type(loss).__name__}\")",
        "expected_output": "Loss: 4.0\nLoss dtype: float"
    },
    {
        "test": "import numpy as np\nmp = MixedPrecision(loss_scale=100.0)\ngrads = np.array([200.0, 100.0], dtype=np.float64)\nresult = mp.backward(grads)\nprint(f\"Gradients: [{result[0]:.0f} {result[1]:.0f}]\")\nprint(f\"Grad dtype: {result.dtype}\")",
        "expected_output": "Gradients: [2 1]\nGrad dtype: float32"
    },
    {
        "test": "import numpy as np\nmp = MixedPrecision(loss_scale=2048.0)\nweights = np.array([0.25], dtype=np.float64)\ninputs = np.array([[4.0]], dtype=np.float64)\ntargets = np.array([2.0], dtype=np.float64)\nloss = mp.forward(weights, inputs, targets)\nprint(f\"Loss: {loss:.1f}\")\nprint(f\"Loss dtype: {type(loss).__name__}\")",
        "expected_output": "Loss: 2048.0\nLoss dtype: float"
    },
    {
        "test": "import numpy as np\nmp = MixedPrecision(loss_scale=2048.0)\ngrads = np.array([np.nan], dtype=np.float16)\nresult = mp.backward(grads)\nprint(f\"Gradients: [{result[0]:.0f}]\")\nprint(f\"Grad dtype: {result.dtype}\")",
        "expected_output": "Gradients: [0]\nGrad dtype: float32"
    },
    {
        "test": "import numpy as np\nmp = MixedPrecision(loss_scale=256.0)\nweights = np.array([1.0], dtype=np.float16)\ninputs = np.array([[2.0]], dtype=np.float16)\ntargets = np.array([3.0], dtype=np.float16)\nloss = mp.forward(weights, inputs, targets)\nprint(f\"Loss: {loss:.1f}\")\nprint(f\"Loss dtype: {type(loss).__name__}\")",
        "expected_output": "Loss: 256.0\nLoss dtype: float"
    },
    {
        "test": "import numpy as np\nmp = MixedPrecision(loss_scale=256.0)\ngrads = np.array([np.inf], dtype=np.float64)\nresult = mp.backward(grads)\nprint(f\"Gradients: [{result[0]:.0f}]\")\nprint(f\"Grad dtype: {result.dtype}\")",
        "expected_output": "Gradients: [0]\nGrad dtype: float32"
    }
]
