[Mosaic GPU] Use the packing trick for int4 -> bf16 conversions

apaszke · Google-ML-Automation · commit bbec9478cc4f · 2025-11-14T05:37:12.000-08:00
PiperOrigin-RevId: 832278748
diff --git a/jax/experimental/mosaic/gpu/fragmented_array.py b/jax/experimental/mosaic/gpu/fragmented_array.py
@@ -1946,7 +1946,11 @@ def upcast_to_f8e4m3fn(reg: ir.Value, part: int):
     if cur_dtype == i4 and self.is_signed and new_dtype == bf16 and vector_len % 2 == 0:
       new_registers = np.empty_like(self.registers)
       out_vec_ty = ir.VectorType.get((vector_len,), new_dtype)
-      for idx, reg in np.ndenumerate(self.registers):
+      # We use packed_registers for consistency, even though the packing is not
+      # really profitable here: the PTX below begins with an op that depends on
+      # the extracted part, so there are no ops that can be shared across
+      # packed parts.
+      for indices, reg in packed_registers(2, if_not_sliced=True):
         # The algorithm here is largely the same as CUTLASS's
         # NumericArrayConverter specialization for int4 -> bf16 casts.
         # We modify it slightly, because we only extract 2 values.
@@ -1962,7 +1966,7 @@ def upcast_to_f8e4m3fn(reg: ir.Value, part: int):
         # bias coming from flipping the sign bit which is 136 (0x4308 as bits).
         def upcast_i4_to_bf16(reg: ir.Value, reg_shr: ir.Value, part: int):
           assert 0 <= part < 4
-          return llvm.inline_asm(
+          int_reg = llvm.inline_asm(
               i32,
               [reg, reg_shr],
               f"""
@@ -1976,43 +1980,43 @@ def upcast_i4_to_bf16(reg: ir.Value, reg_shr: ir.Value, part: int):
               """,
               "=r,r,r",
           )
-        offset = 0
+          return utils.bitcast(int_reg, ir.VectorType.get((2,), bf16))
+        [group_size] = ir.VectorType(reg.type).shape
+        assert group_size % vector_len == 0
+        assert group_size * 4 <= 32
+        int_ty = ir.IntegerType.get_signless(group_size * 4)
+        # If the vector originates from a slice (common after relayouts), we
+        # can fuse the slicing into the conversion and prevent LLVM from
+        # generating a bunch of shifts to align the vector data to the LSB.
+        # This also lets us share the right shift among more vectors.
         out_int_regs: list[ir.Value] = []
-        # TODO(apaszke): Use packed_registers here.
-        for group_size in (8, 4, 2):
-          int_ty = ir.IntegerType.get_signless(group_size * 4)
-          while vector_len - offset >= group_size:
-            # If the vector originates from a slice (common after relayouts), we
-            # can fuse the slicing into the conversion and prevent LLVM from
-            # generating a bunch of shifts to align the vector data to the LSB.
-            # This also lets us share the right shift among more vectors.
-            if (isinstance(slice_op := reg.owner.opview, vector.ExtractStridedSliceOp)
-                and utils.bitwidth(slice_op.source.type) == 32
-                and slice_op.strides[0].value == 1):
-              slice_offset = slice_op.offsets[0].value + offset
-              reg_int = utils.bitcast(slice_op.source, i32)
-              reg_int_shr = arith.shrui(reg_int, c(4, i32))
-              out_int_regs.extend(
-                  upcast_i4_to_bf16(reg_int, reg_int_shr, part=(slice_offset // 2 + part))
-                  for part in range(group_size // 2)
-              )
-            else:
-              reg_slice = utils.vector_slice(reg, slice(offset, offset + group_size))
-              reg_slice_int = utils.bitcast(reg_slice, int_ty)
-              if int_ty != i32:
-                reg_slice_int = arith.extsi(i32, reg_slice_int)
-              reg_slice_int_shr = arith.shrui(reg_slice_int, c(4, i32))
-              out_int_regs.extend(
-                  upcast_i4_to_bf16(reg_slice_int, reg_slice_int_shr, part=part)
-                  for part in range(group_size // 2)
-              )
-            offset += group_size
-        assert offset == vector_len
-        out_vec_int = utils.vector_concat([
-            vector.broadcast(ir.VectorType.get((1,), i32), reg)
-            for reg in out_int_regs
-        ])
-        new_registers[idx] = utils.bitcast(out_vec_int, out_vec_ty)
+        if regs_from_32bit_slice:
+          slice_op = reg.owner.opview
+          slice_offset = slice_op.offsets[0].value
+          reg_int = utils.bitcast(slice_op.source, i32)
+          reg_int_shr = arith.shrui(reg_int, c(4, i32))
+          assert slice_offset % 2 == 0
+          out_int_regs.extend(
+              upcast_i4_to_bf16(reg_int, reg_int_shr, part=slice_offset // 2 + part)
+              for part in range(group_size // 2)
+          )
+        else:
+          reg_slice_int = utils.bitcast(reg, int_ty)
+          if int_ty != i32:
+            reg_slice_int = arith.extsi(i32, reg_slice_int)
+          reg_slice_int_shr = arith.shrui(reg_slice_int, c(4, i32))
+          out_int_regs.extend(
+              upcast_i4_to_bf16(reg_slice_int, reg_slice_int_shr, part=part)
+              for part in range(group_size // 2)
+          )
+        out_reg = utils.vector_concat(out_int_regs)
+        offset = 0
+        for idx in indices:
+          new_registers[idx] = new_reg = utils.vector_slice(
+              out_reg, slice(offset, offset + vector_len)
+          )
+          offset += vector_len
+          assert new_reg.type == out_vec_ty
       return FragmentedArray(
           _registers=new_registers, _layout=self.layout, _is_signed=None
       )
@@ -2058,6 +2062,7 @@ def upcast_i4_to_i8(reg: ir.Value, first_valid_nibble: int = 0):
           ])
         [group_size] = ir.VectorType(reg.type).shape
         assert group_size % vector_len == 0
+        assert group_size * 4 <= 32
         int_ty = ir.IntegerType.get_signless(group_size * 4)
         if regs_from_32bit_slice:
           slice_op = reg.owner.opview
diff --git a/tests/mosaic/gpu_test.py b/tests/mosaic/gpu_test.py
@@ -667,21 +667,21 @@ def kernel(ctx, inp, out, smem):
           (jnp.int4, jnp.int8),
           # TODO(apaszke,bchetioui): bf16/f32 -> f8e4m3fn
       ),
-      layout_desc=(
-          "WGMMA_LAYOUT",
-          "WGMMA_LAYOUT_8BIT",
-          "WGMMA_LAYOUT_UPCAST_2X",
-          "WGMMA_LAYOUT_UPCAST_4X",
+      layout_descs=(
+          ("WGMMA_LAYOUT", "WGMMA_LAYOUT"),
+          ("WGMMA_LAYOUT_8BIT", "WGMMA_LAYOUT_8BIT"),
+          ("WGMMA_LAYOUT_UPCAST_2X", "WGMMA_LAYOUT_UPCAST_2X"),
+          ("WGMMA_LAYOUT_UPCAST_2X", "WGMMA_LAYOUT"),
+          ("WGMMA_LAYOUT_UPCAST_4X", "WGMMA_LAYOUT_UPCAST_4X"),
+          ("WGMMA_LAYOUT_UPCAST_4X", "WGMMA_LAYOUT_UPCAST_2X"),
+          ("WGMMA_LAYOUT_UPCAST_4X", "WGMMA_LAYOUT"),
       ),
-      change_layout=(False, True),
   )
   @jtu.skip_if_mosaic_gpu_exceeds_shared_memory(device_patterns="RTX PRO 6000 Blackwell")
-  def test_optimized_conversion(self, jax_dtype_from_to, layout_desc, change_layout):
-    if change_layout and layout_desc == "WGMMA_LAYOUT":
-      self.skipTest("No-op relayout")
-    if change_layout and layout_desc == "WGMMA_LAYOUT_8BIT":
-      self.skipTest("Unimplemented relayout")
-    layout: fa.TiledLayout = getattr(fa, layout_desc)
+  def test_optimized_conversion(self, jax_dtype_from_to, layout_descs):
+    layout_desc_from, layout_desc_to = layout_descs
+    layout_from: fa.TiledLayout = getattr(fa, layout_desc_from)
+    layout_to: fa.TiledLayout = getattr(fa, layout_desc_to)
     jax_dtype_from, jax_dtype_to = jax_dtype_from_to
     mlir_dtype_from = utils.dtype_to_ir_type(jax_dtype_from)
     mlir_dtype_to = utils.dtype_to_ir_type(jax_dtype_to)
@@ -692,16 +692,16 @@ def kernel(ctx, inp, out, smem):
       t = mgpu.FragmentedArray.load_untiled(
           inp,
           is_signed=utils.is_signed(jax_dtype_from),
-          layout=layout,
+          layout=layout_from,
           optimized=False,
       )
-      if change_layout:
+      if layout_from != layout_to:
         if (
-            layout == fa.WGMMA_LAYOUT_UPCAST_4X
-            and utils.bitwidth(mlir_dtype_from) > 4
+            layout_from == fa.WGMMA_LAYOUT_UPCAST_4X
+            and utils.bitwidth(mlir_dtype_from) != 4
         ):
           self.skipTest("Unimplemented relayout")
-        t = t.to_layout(fa.WGMMA_LAYOUT)
+        t = t.to_layout(layout_to)
       t = t.astype(mlir_dtype_to, is_signed=utils.is_signed(jax_dtype_to))
       t.store_untiled(out, optimized=False)
 
@@ -725,7 +725,7 @@ def _maybe_profile():
       with open(file_path, "a") as f:
         data = (
             jnp.dtype(jax_dtype_from).name, jnp.dtype(jax_dtype_to).name,
-            layout_desc, change_layout, sass().count("\n"),
+            layout_desc_from, layout_desc_to, sass().count("\n")
         )
         f.write(",".join(map(str, data)) + "\n")
         f.flush()