Commit 8f637cd
Add final normalization step for non-NVIDIA devices in Qwen2 Q8_0 FFN layers.
1 parent 2b27344 commit 8f637cd

1 file changed: 10 additions, 0 deletions

src/main/java/org/beehive/gpullama3/tornadovm/layers/type/q8_0/Qwen2Q8_0FFNLayers.java

Lines changed: 10 additions & 0 deletions
@@ -268,6 +268,16 @@ TaskGraph setupSingleQwen2Q8_0FFNLayer(Qwen2TornadoWeights weights, int layerInd
                 config.rmsNormEps(), // epsilon
                 qwen2State.localSize); // local memory size
 
+        // Final normalization (non-NVIDIA only)
+        if (shouldUseFinalNormalization()) {
+            unifiedLayer.task("ffn_rms_finalize",
+                    TransformerComputeKernelsLayered::reductionFinalNormalization,
+                    context,
+                    qwen2State.tempFFN, // scale factor (in/out)
+                    config.dim(), // dimension
+                    config.rmsNormEps()); // epsilon
+        }
+
         // Fused RMS Apply + Gate/Up Projection + SiLU + GLU
         // (Replaces mapContextFFN + fusedFeedForwardWithSiLUAndGLUActivation)
         unifiedLayer.task("rms_ffn_gate_up",

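The shouldUseFinalNormalization() guard is defined elsewhere in the layer hierarchy and is not shown in this diff. A hypothetical illustration of the gating the commit message describes, where the platform-name check is an assumption rather than the repository's actual logic:

    // Hypothetical sketch, not the repository's implementation: enable the
    // extra finalization task only when the selected device is not an NVIDIA GPU.
    static boolean shouldUseFinalNormalization(String devicePlatformName) {
        return !devicePlatformName.toLowerCase().contains("nvidia");
    }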
0 commit comments