@@ -60,14 +60,18 @@ def element_type_to_backend_config_type(dtype):
   return _element_type_to_backend_config_type_mapping[dtype]
 
 
-def _scaled_matmul_impl(a, b, a_scale, b_scale, preferred_element_type):
+def _scaled_matmul_impl(a, b, a_scale, b_scale, global_scale,
+                        preferred_element_type, has_global_scale):
   return _scaled_matmul_p.bind(
-      a, b, a_scale, b_scale, preferred_element_type=preferred_element_type
+      a, b, a_scale, b_scale, global_scale,
+      preferred_element_type=preferred_element_type,
+      has_global_scale=has_global_scale
   )
 
 
 def _scaled_matmul_cuda_lowering(
-    ctx, a, b, a_scales, b_scales, preferred_element_type
+    ctx, a, b, a_scales, b_scales, global_scale, preferred_element_type,
+    has_global_scale
 ):
   lhs_type = ir.RankedTensorType(a.type)
   lhs_shape = lhs_type.shape
@@ -82,6 +86,8 @@ def _scaled_matmul_cuda_lowering(
   result_types = [ir.RankedTensorType.get(result_shape, out_type)]
 
   operands = [a, b, a_scales, b_scales]
+  if has_global_scale:
+    operands.append(global_scale)
   backend_config = {
       "scaled_dot_backend_config": {
           "lhs_batch_dimensions": [0],
@@ -104,7 +110,8 @@ def _scaled_matmul_cuda_lowering(
   return [out.result]
 
 
-def _scaled_matmul_abstract(a, b, a_scale, b_scale, *, preferred_element_type):
+def _scaled_matmul_abstract(a, b, a_scale, b_scale, global_scale,
+                            *, preferred_element_type, has_global_scale):
   batch, non_contracting_lhs, contracting_lhs = a.shape
   _, non_contracting_rhs, _ = b.shape
   output_shape = (batch, non_contracting_lhs, non_contracting_rhs)
@@ -296,7 +303,7 @@ def _scaled_matmul_impl_partition(a, b, a_scale, b_scale):
 
 
 _scaled_matmul_lower = custom_partitioning(
-    _scaled_matmul_impl, static_argnums=(4,)
+    _scaled_matmul_impl, static_argnums=(5, 6)
 )
 
 _scaled_matmul_lower.def_partition(
@@ -306,16 +313,13 @@ def _scaled_matmul_impl_partition(a, b, a_scale, b_scale):
 )
 
 
-def _scaled_matmul_batcher(batched_args, batch_dims, *, preferred_element_type):
-  assert len(batch_dims) == 4
-  assert (
-      batch_dims[0] == batch_dims[1]
-      and batch_dims[0] == batch_dims[2]
-      and batch_dims[0] == batch_dims[3]
-  )
+def _scaled_matmul_batcher(batched_args, batch_dims, *, preferred_element_type,
+                           has_global_scale):
+  assert len(batch_dims) == 5
+  assert len(set(batch_dims[:4])) == 1 and batch_dims[4] is None
   lhs_bdims = batch_dims[0]
   out_bdims = (batch_dims[0],)
-  lhs, rhs, lhs_scales, rhs_scales = batched_args
+  lhs, rhs, lhs_scales, rhs_scales, global_scale = batched_args
   *batch, lhs_non_contracting, contracting = lhs.shape
   *_, _, scales_contracting = lhs_scales.shape
   *_, rhs_non_contracting, _ = rhs.shape
@@ -336,7 +340,9 @@ def _scaled_matmul_batcher(batched_args, batch_dims, *, preferred_element_type):
           rhs,
           lhs_scales,
           rhs_scales,
+          global_scale,
           preferred_element_type=preferred_element_type,
+          has_global_scale=has_global_scale,
       )[0],
       (*batch, lhs_non_contracting, rhs_non_contracting),
   )
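
A note on the batching rule above: the assertion now requires the four tensor operands to share a batch dimension while the scalar `global_scale` stays unbatched (`batch_dims[4] is None`); the vmapped axis on the tensors is folded together with the existing batch axis for the kernel call and split back out on the result, which is what the surrounding reshape to `(*batch, lhs_non_contracting, rhs_non_contracting)` suggests. A toy sketch of that reshape pattern (shapes and names are made up, not the batcher's real variables):

```python
# Toy shapes only: fold a vmapped axis into the existing batch axis for the
# kernel call, then restore it on the output.
import jax.numpy as jnp

vmapped, batch, M, K = 2, 3, 8, 16
lhs = jnp.ones((vmapped, batch, M, K))

lhs_flat = lhs.reshape(vmapped * batch, M, K)      # single batch axis for the kernel
# ... the scaled-matmul primitive would run on lhs_flat here ...
restored = lhs_flat.reshape(vmapped, batch, M, K)  # back to the vmapped layout
assert restored.shape == lhs.shape
```
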
@@ -355,17 +361,20 @@ def _scaled_matmul_batcher(batched_args, batch_dims, *, preferred_element_type):
 batching.primitive_batchers[_scaled_matmul_p] = _scaled_matmul_batcher
 
 
-@api.jit(static_argnames=("preferred_element_type",))
+@api.jit(static_argnames=("preferred_element_type", "has_global_scale"))
 def _scaled_matmul(
     lhs: Array,
     rhs: Array,
     lhs_scales: Array,
     rhs_scales: Array,
+    global_scale: Array,
     preferred_element_type: DTypeLike = np.dtype('float32'),
+    has_global_scale: bool = False,
 ) -> Array:
   output = _scaled_matmul_p_wrapper.bind(
-      lhs, rhs, lhs_scales, rhs_scales,
-      preferred_element_type=preferred_element_type
+      lhs, rhs, lhs_scales, rhs_scales, global_scale,
+      preferred_element_type=preferred_element_type,
+      has_global_scale=has_global_scale
   )
   return output[0]
 
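
An aside on the decorator change: `has_global_scale` joins `preferred_element_type` in `static_argnames` because it controls whether the `global_scale` operand is forwarded to the primitive (see the lowering hunk above), so it must be a Python-level constant at trace time; `jit` then specializes and retraces per distinct static value. A hypothetical toy example of the same pattern, unrelated to this file:

```python
# Hypothetical toy function showing a static bool under jax.jit: the branch is
# resolved while tracing, and jit retraces for each distinct static value.
from functools import partial
import jax
import jax.numpy as jnp

@partial(jax.jit, static_argnames=("use_scale",))
def toy_matmul(a, b, scale, use_scale: bool = False):
  out = a @ b
  return out * scale if use_scale else out

a = jnp.ones((4, 8))
b = jnp.ones((8, 3))
print(toy_matmul(a, b, jnp.float32(2.0), use_scale=True).shape)  # (4, 3)
```
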
@@ -374,7 +383,9 @@ def scaled_matmul_wrapper(
     rhs: Array,
     lhs_scales: Array,
     rhs_scales: Array,
+    global_scale: Array,
     preferred_element_type: DTypeLike = np.dtype('float32'),
+    has_global_scale: bool = False,
 ) -> Array:
   """
   Performs scaled matrix multiplication between two 3D arrays, with scaling
@@ -385,8 +396,10 @@ def scaled_matmul_wrapper(
       rhs (Array): A 3D array of shape (B, N, K).
       lhs_scales (Array): A 3D array of shape (B, M, K_block).
       rhs_scales (Array): A 3D array of shape (B, N, K_block).
+      global_scale (Array): A 0D array (scalar) holding the global scale.
       preferred_element_type (DTypeLike, optional): The preferred data type
           for the computation. Defaults to `jnp.float32`.
+      has_global_scale (bool, optional): Whether to apply `global_scale`.
 
   Returns:
       Array: A 3D array of shape (B, M, N) representing the scaled matrix
@@ -416,7 +429,9 @@ def scaled_matmul_wrapper(
       rhs,
       lhs_scales,
       rhs_scales,
+      global_scale,
       preferred_element_type=preferred_element_type,
+      has_global_scale=has_global_scale,
   )
 
   return out
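
To make the new signature concrete, here is a hedged usage sketch of `scaled_matmul_wrapper`. The dtypes (`float8_e4m3fn` values with `float8_e8m0fnu` block scales), the block size of 32, and the shapes are illustrative assumptions, and running it requires a GPU/cuDNN build that supports block-scaled matmul:

```python
# Illustrative only: assumes scaled_matmul_wrapper from this module is in scope
# and that the backend supports block-scaled matmul.
import jax.numpy as jnp

B, M, N, K, BLOCK = 1, 128, 128, 256, 32              # assumed sizes
lhs = jnp.zeros((B, M, K), dtype=jnp.float8_e4m3fn)
rhs = jnp.zeros((B, N, K), dtype=jnp.float8_e4m3fn)
lhs_scales = jnp.ones((B, M, K // BLOCK), dtype=jnp.float8_e8m0fnu)
rhs_scales = jnp.ones((B, N, K // BLOCK), dtype=jnp.float8_e8m0fnu)
global_scale = jnp.array(1.0, dtype=jnp.float32)      # 0D scalar operand

out = scaled_matmul_wrapper(
    lhs, rhs, lhs_scales, rhs_scales, global_scale,
    preferred_element_type=jnp.float32,
    has_global_scale=True,
)
assert out.shape == (B, M, N)
```
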
@@ -577,18 +592,17 @@ def scaled_dot_impl(lhs, rhs, dimension_numbers, preferred_element_type,
   lhs_q, lhs_scales = quantize(lhs_3d, lhs_config)
   rhs_q, rhs_scales = quantize(rhs_3d, rhs_config)
 
-  out_dtype = preferred_element_type
-  if configs[0].mode == 'nvfp4':
-    out_dtype = np.float32
+  has_global_scale = configs[0].mode == 'nvfp4'
+  global_scale = jnp.array(
+      configs[0].global_scale * configs[1].global_scale
+      if has_global_scale else 0, dtype=preferred_element_type)
 
   out = scaled_matmul_wrapper(
-      lhs_q, rhs_q, lhs_scales, rhs_scales, preferred_element_type=out_dtype
+      lhs_q, rhs_q, lhs_scales, rhs_scales, global_scale,
+      preferred_element_type=preferred_element_type,
+      has_global_scale=has_global_scale,
   )
 
-  if configs[0].mode == 'nvfp4':
-    out *= (configs[0].global_scale * configs[1].global_scale)
-    out = out.astype(preferred_element_type)
-
   expanded_out_shape = compute_dot_output_shape(
       lhs.shape, rhs.shape, lhs_dn, rhs_dn
   )
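
The hunk above drops the post-hoc nvfp4 correction (multiplying the output by `configs[0].global_scale * configs[1].global_scale` and then casting) and instead folds the product of the two per-tensor global scales into a single scalar operand that the kernel applies itself; when no global scale is needed, a placeholder zero travels with `has_global_scale=False`. As a rough, hypothetical reference model of the fused semantics (plain NumPy, invented helper names, block size of 32 assumed):

```python
# Hypothetical reference model, not the library's implementation: dequantize
# each operand with its per-block scales, contract over K, then apply the
# single fused global scale once.
import numpy as np

def dequant(q, scales, block=32):
  # Expand per-block scales along the contracting axis and rescale the values.
  return q.astype(np.float32) * np.repeat(scales.astype(np.float32), block, axis=-1)

def reference_scaled_matmul(lhs_q, rhs_q, lhs_scales, rhs_scales, global_scale):
  lhs = dequant(lhs_q, lhs_scales)            # (B, M, K)
  rhs = dequant(rhs_q, rhs_scales)            # (B, N, K)
  out = np.einsum("bmk,bnk->bmn", lhs, rhs)   # contract over K
  return out * global_scale                   # fused scalar, applied once
```
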
@@ -625,7 +639,8 @@ def scaled_dot_general_transpose_lhs(
     y_q, y_scales = quantize(y_3d, y_config)
 
     out = scaled_matmul_wrapper(
-        g_q, y_q, g_scales, y_scales, preferred_element_type
+        g_q, y_q, g_scales, y_scales, jnp.array(0),
+        preferred_element_type, has_global_scale=False
     )
   else:
     out = jnp.matmul(g_3d, jnp.permute_dims(y_3d, (0, 2, 1)), preferred_element_type=preferred_element_type)