From 95b570d569f9a24db2544512f2eeb0a96c8a6153 Mon Sep 17 00:00:00 2001
From: "Matthias J. Kannwischer" <matthias@kannwischer.eu>
Date: Mon, 4 Aug 2025 14:38:17 +0800
Subject: [PATCH 1/3] AVX2: Add native implementation of poly_decompose

This adds the AVX2 intrinsics implementation of poly_decompose from
https://github.com/pq-crystals/dilithium/blob/master/avx2/rounding.c.

Resolves https://github.com/pq-code-package/mldsa-native/issues/399.

Signed-off-by: Matthias J. Kannwischer <matthias@kannwischer.eu>
---
 mldsa/native/api.h                            | 43 ++++++++++++
 mldsa/native/x86_64/meta.h                    | 14 ++++
 mldsa/native/x86_64/src/arith_native_x86_64.h |  6 ++
 .../x86_64/src/poly_decompose_32_avx2.c       | 66 ++++++++++++++++++
 .../x86_64/src/poly_decompose_88_avx2.c       | 68 +++++++++++++++++++
 mldsa/poly.c                                  | 14 +++-
 6 files changed, 210 insertions(+), 1 deletion(-)
 create mode 100644 mldsa/native/x86_64/src/poly_decompose_32_avx2.c
 create mode 100644 mldsa/native/x86_64/src/poly_decompose_88_avx2.c

diff --git a/mldsa/native/api.h b/mldsa/native/api.h
index 688f62a1..3afa66b6 100644
--- a/mldsa/native/api.h
+++ b/mldsa/native/api.h
@@ -168,4 +168,47 @@ static MLD_INLINE int mld_rej_uniform_eta4_native(int32_t *r, unsigned len,
                                                   unsigned buflen);
 #endif /* MLD_USE_NATIVE_REJ_UNIFORM_ETA4 */
 
+#if defined(MLD_USE_NATIVE_POLY_DECOMPOSE_32)
+/*************************************************
+ * Name:        mld_poly_decompose_32_native
+ *
+ * Description: Native implementation of poly_decompose for GAMMA2 = (Q-1)/32.
+ *              For all coefficients c of the input polynomial,
+ *              compute high and low bits c0, c1 such
+ *              c mod MLDSA_Q = c1*(2*GAMMA2) + c0
+ *              with -(2*GAMMA2)/2 < c0 <= (2*GAMMA2)/2 except
+ *              c1 = (MLDSA_Q-1)/(2*GAMMA2) where we set
+ *              c1 = 0 and -(2*GAMMA2)/2 <= c0 = c mod MLDSA_Q - MLDSA_Q < 0.
+ *              Assumes coefficients to be standard representatives.
+ *
+ * Arguments:   - int32_t *a1: output polynomial with coefficients c1
+ *              - int32_t *a0: output polynomial with coefficients c0
+ *              - const int32_t *a: input polynomial
+ **************************************************/
+static MLD_INLINE void mld_poly_decompose_32_native(int32_t *a1, int32_t *a0,
+                                                    const int32_t *a);
+#endif /* MLD_USE_NATIVE_POLY_DECOMPOSE_32 */
+
+#if defined(MLD_USE_NATIVE_POLY_DECOMPOSE_88)
+/*************************************************
+ * Name:        mld_poly_decompose_88_native
+ *
+ * Description: Native implementation of poly_decompose for GAMMA2 = (Q-1)/88.
+ *              For all coefficients c of the input polynomial,
+ *              compute high and low bits c0, c1 such
+ *              c mod MLDSA_Q = c1*(2*GAMMA2) + c0
+ *              with -(2*GAMMA2)/2 < c0 <= (2*GAMMA2)/2 except
+ *              c1 = (MLDSA_Q-1)/(2*GAMMA2) where we set
+ *              c1 = 0 and -(2*GAMMA2)/2 <= c0 = c mod MLDSA_Q - MLDSA_Q < 0.
+ *              Assumes coefficients to be standard representatives.
+ *
+ * Arguments:   - int32_t *a1: output polynomial with coefficients c1
+ *              - int32_t *a0: output polynomial with coefficients c0
+ *              - const int32_t *a: input polynomial
+ **************************************************/
+static MLD_INLINE void mld_poly_decompose_88_native(int32_t *a1, int32_t *a0,
+                                                    const int32_t *a);
+#endif /* MLD_USE_NATIVE_POLY_DECOMPOSE_88 */
+
+
 #endif /* !MLD_NATIVE_API_H */
diff --git a/mldsa/native/x86_64/meta.h b/mldsa/native/x86_64/meta.h
index 8bc21317..b2630e9b 100644
--- a/mldsa/native/x86_64/meta.h
+++ b/mldsa/native/x86_64/meta.h
@@ -17,6 +17,8 @@
 #define MLD_USE_NATIVE_REJ_UNIFORM
 #define MLD_USE_NATIVE_REJ_UNIFORM_ETA2
 #define MLD_USE_NATIVE_REJ_UNIFORM_ETA4
+#define MLD_USE_NATIVE_POLY_DECOMPOSE_32
+#define MLD_USE_NATIVE_POLY_DECOMPOSE_88
 
 #if !defined(__ASSEMBLER__)
 #include <string.h>
@@ -98,6 +100,18 @@ static MLD_INLINE int mld_rej_uniform_eta4_native(int32_t *r, unsigned len,
   return outlen;
 }
 
+static MLD_INLINE void mld_poly_decompose_32_native(int32_t *a1, int32_t *a0,
+                                                    const int32_t *a)
+{
+  mld_poly_decompose_32_avx2((__m256i *)a1, (__m256i *)a0, (const __m256i *)a);
+}
+
+static MLD_INLINE void mld_poly_decompose_88_native(int32_t *a1, int32_t *a0,
+                                                    const int32_t *a)
+{
+  mld_poly_decompose_88_avx2((__m256i *)a1, (__m256i *)a0, (const __m256i *)a);
+}
+
 #endif /* !__ASSEMBLER__ */
 
 #endif /* !MLD_NATIVE_X86_64_META_H */
diff --git a/mldsa/native/x86_64/src/arith_native_x86_64.h b/mldsa/native/x86_64/src/arith_native_x86_64.h
index fd3369bb..e785734c 100644
--- a/mldsa/native/x86_64/src/arith_native_x86_64.h
+++ b/mldsa/native/x86_64/src/arith_native_x86_64.h
@@ -54,4 +54,10 @@ unsigned mld_rej_uniform_eta2_avx2(
 unsigned mld_rej_uniform_eta4_avx2(
     int32_t *r, const uint8_t buf[MLD_AVX2_REJ_UNIFORM_ETA4_BUFLEN]);
 
+#define mld_poly_decompose_32_avx2 MLD_NAMESPACE(mld_poly_decompose_32_avx2)
+void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a);
+
+#define mld_poly_decompose_88_avx2 MLD_NAMESPACE(mld_poly_decompose_88_avx2)
+void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a);
+
 #endif /* !MLD_NATIVE_X86_64_SRC_ARITH_NATIVE_X86_64_H */
diff --git a/mldsa/native/x86_64/src/poly_decompose_32_avx2.c b/mldsa/native/x86_64/src/poly_decompose_32_avx2.c
new file mode 100644
index 00000000..f870b408
--- /dev/null
+++ b/mldsa/native/x86_64/src/poly_decompose_32_avx2.c
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Dilithium implementation @[REF_AVX2].
+ */
+
+#include "../../../common.h"
+
+#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+
+#include <immintrin.h>
+#include <stdint.h>
+#include <string.h>
+#include "arith_native_x86_64.h"
+#include "consts.h"
+
+#define _mm256_blendv_epi32(a, b, mask)                        \
+  _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \
+                                       _mm256_castsi256_ps(b), \
+                                       _mm256_castsi256_ps(mask)))
+
+void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
+{
+  unsigned int i;
+  __m256i f, f0, f1;
+  const __m256i q =
+      _mm256_load_si256(&mld_qdata.vec[MLD_AVX2_BACKEND_DATA_OFFSET_8XQ / 8]);
+  const __m256i hq = _mm256_srli_epi32(q, 1);
+  const __m256i v = _mm256_set1_epi32(1025);
+  const __m256i alpha = _mm256_set1_epi32(2 * MLDSA_GAMMA2);
+  const __m256i off = _mm256_set1_epi32(127);
+  const __m256i shift = _mm256_set1_epi32(512);
+  const __m256i mask = _mm256_set1_epi32(15);
+
+  for (i = 0; i < MLDSA_N / 8; i++)
+  {
+    f = _mm256_load_si256(&a[i]);
+    f1 = _mm256_add_epi32(f, off);
+    f1 = _mm256_srli_epi32(f1, 7);
+    f1 = _mm256_mulhi_epu16(f1, v);
+    f1 = _mm256_mulhrs_epi16(f1, shift);
+    f1 = _mm256_and_si256(f1, mask);
+    f0 = _mm256_mullo_epi32(f1, alpha);
+    f0 = _mm256_sub_epi32(f, f0);
+    f = _mm256_cmpgt_epi32(f0, hq);
+    f = _mm256_and_si256(f, q);
+    f0 = _mm256_sub_epi32(f0, f);
+    _mm256_store_si256(&a1[i], f1);
+    _mm256_store_si256(&a0[i], f0);
+  }
+}
+
+#undef _mm256_blendv_epi32
+
+#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
+       */
+
+MLD_EMPTY_CU(avx2_poly_decompose)
+
+#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \
+          !MLD_CONFIG_MULTILEVEL_NO_SHARED) */
diff --git a/mldsa/native/x86_64/src/poly_decompose_88_avx2.c b/mldsa/native/x86_64/src/poly_decompose_88_avx2.c
new file mode 100644
index 00000000..e83a6fd7
--- /dev/null
+++ b/mldsa/native/x86_64/src/poly_decompose_88_avx2.c
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Dilithium implementation @[REF_AVX2].
+ */
+
+#include "../../../common.h"
+
+#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
+    !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+
+#include <immintrin.h>
+#include <stdint.h>
+#include <string.h>
+#include "arith_native_x86_64.h"
+#include "consts.h"
+
+
+#define _mm256_blendv_epi32(a, b, mask)                        \
+  _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \
+                                       _mm256_castsi256_ps(b), \
+                                       _mm256_castsi256_ps(mask)))
+
+void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
+{
+  unsigned int i;
+  __m256i f, f0, f1, t;
+  const __m256i q =
+      _mm256_load_si256(&mld_qdata.vec[MLD_AVX2_BACKEND_DATA_OFFSET_8XQ / 8]);
+  const __m256i hq = _mm256_srli_epi32(q, 1);
+  const __m256i v = _mm256_set1_epi32(11275);
+  const __m256i alpha = _mm256_set1_epi32(2 * MLDSA_GAMMA2);
+  const __m256i off = _mm256_set1_epi32(127);
+  const __m256i shift = _mm256_set1_epi32(128);
+  const __m256i max = _mm256_set1_epi32(43);
+  const __m256i zero = _mm256_setzero_si256();
+
+  for (i = 0; i < MLDSA_N / 8; i++)
+  {
+    f = _mm256_load_si256(&a[i]);
+    f1 = _mm256_add_epi32(f, off);
+    f1 = _mm256_srli_epi32(f1, 7);
+    f1 = _mm256_mulhi_epu16(f1, v);
+    f1 = _mm256_mulhrs_epi16(f1, shift);
+    t = _mm256_sub_epi32(max, f1);
+    f1 = _mm256_blendv_epi32(f1, zero, t);
+    f0 = _mm256_mullo_epi32(f1, alpha);
+    f0 = _mm256_sub_epi32(f, f0);
+    f = _mm256_cmpgt_epi32(f0, hq);
+    f = _mm256_and_si256(f, q);
+    f0 = _mm256_sub_epi32(f0, f);
+    _mm256_store_si256(&a1[i], f1);
+    _mm256_store_si256(&a0[i], f0);
+  }
+}
+#undef _mm256_blendv_epi32
+
+#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
+       */
+
+MLD_EMPTY_CU(avx2_poly_decompose)
+
+#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \
+          !MLD_CONFIG_MULTILEVEL_NO_SHARED) */
diff --git a/mldsa/poly.c b/mldsa/poly.c
index 90092a8f..49e4c168 100644
--- a/mldsa/poly.c
+++ b/mldsa/poly.c
@@ -174,9 +174,19 @@ void mld_poly_power2round(mld_poly *a1, mld_poly *a0, const mld_poly *a)
 
 void mld_poly_decompose(mld_poly *a1, mld_poly *a0, const mld_poly *a)
 {
+#if defined(MLD_USE_NATIVE_POLY_DECOMPOSE_88) && MLDSA_MODE == 2
+  /* TODO: proof */
+  mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q);
+  mld_poly_decompose_88_native(a1->coeffs, a0->coeffs, a->coeffs);
+#elif defined(MLD_USE_NATIVE_POLY_DECOMPOSE_32) && \
+    (MLDSA_MODE == 3 || MLDSA_MODE == 5)
+  /* TODO: proof */
+  mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q);
+  mld_poly_decompose_32_native(a1->coeffs, a0->coeffs, a->coeffs);
+#else  /* !None && MLD_USE_NATIVE_POLY_DECOMPOSE_32 && (MLDSA_MODE == 3 || \
+          MLDSA_MODE == 5) */
   unsigned int i;
   mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q);
-
   for (i = 0; i < MLDSA_N; ++i)
   __loop__(
     assigns(i, memory_slice(a0, sizeof(mld_poly)), memory_slice(a1, sizeof(mld_poly)))
@@ -187,6 +197,8 @@ void mld_poly_decompose(mld_poly *a1, mld_poly *a0, const mld_poly *a)
   {
     mld_decompose(&a0->coeffs[i], &a1->coeffs[i], a->coeffs[i]);
   }
+#endif /* !None && !(MLD_USE_NATIVE_POLY_DECOMPOSE_32 && (MLDSA_MODE == 3 || \
+          MLDSA_MODE == 5)) */
 
   mld_assert_abs_bound(a0->coeffs, MLDSA_N, MLDSA_GAMMA2 + 1);
   mld_assert_bound(a1->coeffs, MLDSA_N, 0, (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2));

From 7e2191f045f7f85a71fc9b73b5cba896dfb139f1 Mon Sep 17 00:00:00 2001
From: "Matthias J. Kannwischer" <matthias@kannwischer.eu>
Date: Mon, 4 Aug 2025 17:47:27 +0800
Subject: [PATCH 2/3] AArch64: Add native implementation of poly_decompose

This add a native implementation of poly_decompose written from scratch.

Resolves https://github.com/pq-code-package/mldsa-native/issues/397

Signed-off-by: Matthias J. Kannwischer <matthias@kannwischer.eu>
---
 mldsa/native/aarch64/meta.h                   |  14 +++
 .../native/aarch64/src/arith_native_aarch64.h |   6 +
 .../aarch64/src/poly_decompose_32_asm.S       | 114 ++++++++++++++++++
 .../aarch64/src/poly_decompose_88_asm.S       | 112 +++++++++++++++++
 4 files changed, 246 insertions(+)
 create mode 100644 mldsa/native/aarch64/src/poly_decompose_32_asm.S
 create mode 100644 mldsa/native/aarch64/src/poly_decompose_88_asm.S

diff --git a/mldsa/native/aarch64/meta.h b/mldsa/native/aarch64/meta.h
index f91379f4..19a554db 100644
--- a/mldsa/native/aarch64/meta.h
+++ b/mldsa/native/aarch64/meta.h
@@ -13,6 +13,8 @@
 #define MLD_USE_NATIVE_REJ_UNIFORM
 #define MLD_USE_NATIVE_REJ_UNIFORM_ETA2
 #define MLD_USE_NATIVE_REJ_UNIFORM_ETA4
+#define MLD_USE_NATIVE_POLY_DECOMPOSE_32
+#define MLD_USE_NATIVE_POLY_DECOMPOSE_88
 
 /* Identifier for this backend so that source and assembly files
  * in the build can be appropriately guarded. */
@@ -93,6 +95,18 @@ static MLD_INLINE int mld_rej_uniform_eta4_native(int32_t *r, unsigned len,
   return outlen;
 }
 
+static MLD_INLINE void mld_poly_decompose_32_native(int32_t *a1, int32_t *a0,
+                                                    const int32_t *a)
+{
+  mld_poly_decompose_32_asm(a1, a0, a);
+}
+
+static MLD_INLINE void mld_poly_decompose_88_native(int32_t *a1, int32_t *a0,
+                                                    const int32_t *a)
+{
+  mld_poly_decompose_88_asm(a1, a0, a);
+}
+
 #endif /* !__ASSEMBLER__ */
 
 #endif /* !MLD_NATIVE_AARCH64_META_H */
diff --git a/mldsa/native/aarch64/src/arith_native_aarch64.h b/mldsa/native/aarch64/src/arith_native_aarch64.h
index e011ab4c..1b506090 100644
--- a/mldsa/native/aarch64/src/arith_native_aarch64.h
+++ b/mldsa/native/aarch64/src/arith_native_aarch64.h
@@ -62,4 +62,10 @@ unsigned mld_rej_uniform_eta2_asm(int32_t *r, const uint8_t *buf,
 unsigned mld_rej_uniform_eta4_asm(int32_t *r, const uint8_t *buf,
                                   unsigned buflen, const uint8_t *table);
 
+#define mld_poly_decompose_32_asm MLD_NAMESPACE(poly_decompose_32_asm)
+void mld_poly_decompose_32_asm(int32_t *a1, int32_t *a0, const int32_t *a);
+
+#define mld_poly_decompose_88_asm MLD_NAMESPACE(poly_decompose_88_asm)
+void mld_poly_decompose_88_asm(int32_t *a1, int32_t *a0, const int32_t *a);
+
 #endif /* !MLD_NATIVE_AARCH64_SRC_ARITH_NATIVE_AARCH64_H */
diff --git a/mldsa/native/aarch64/src/poly_decompose_32_asm.S b/mldsa/native/aarch64/src/poly_decompose_32_asm.S
new file mode 100644
index 00000000..a9cc4212
--- /dev/null
+++ b/mldsa/native/aarch64/src/poly_decompose_32_asm.S
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+#include "../../../common.h"
+
+#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+
+.macro decompose32 a1, a0, input, temp
+        // Step 1: Compute ceil(a / 128) using floor((a + 127) / 128)
+        // This is the first part of computing a1 = floor(a / (2*GAMMA2))
+        // where 2*GAMMA2 = 523776. We break this into two steps:
+        // ceil(a / 128) followed by round(temp / 4092)
+        add \a1\().4s, \input\().4s, offset_127.4s
+        ushr \a1\().4s, \a1\().4s, #7
+
+        // Step 2: Barrett reduction with rounding: round(temp * 1025 / 2^22)
+        // This computes: round(ceil(a/128) / 4092)
+        // Combined: a1 ≈ round(ceil(a/128) / 4092) ≈ floor(a / 523776)
+        // sqrdmulh computes: (2 * temp * 524800 + 2^31) >> 32
+        // which is equivalent to: (temp * 1025 + 2^21) >> 22.
+        sqrdmulh \a1\().4s, \a1\().4s, barrett_const.4s
+
+        // Step 3: Mask to valid range [0, 14] since (Q-1)/(2*GAMMA2) = 15
+        and \a1\().16b, \a1\().16b, mask_15.16b
+
+        // Step 4: Compute a0 = a - a1 * 2*GAMMA2 (low part of decomposition)
+        mls \input\().4s, \a1\().4s, gamma2_2x.4s
+
+        // Step 5: Conditional reduction: if a0 > (Q-1)/2 then a0 -= Q
+        cmgt \temp\().4s, \input\().4s, q_half.4s
+        and  \temp\().16b, \temp\().16b, q.16b
+        sub \a0\().4s, \input\().4s, \temp\().4s
+.endm
+
+        /* Parameters */
+        a1_ptr          .req x0     // Output polynomial with coefficients c1
+        a0_ptr          .req x1     // Output polynomial with coefficients c0
+        a_ptr           .req x2     // Input polynomial
+
+        count           .req x3
+
+        /* Constant register assignments */
+        q               .req v20    // Q = 8380417
+        q_half          .req v21    // (Q-1)/2
+        gamma2_2x       .req v22    // 2*GAMMA2 = 523776
+        mask_15         .req v23    // mask = 15
+        offset_127      .req v24    // offset = 127
+        barrett_const   .req v25    // Barrett constant = 524800
+
+
+.text
+.global MLD_ASM_NAMESPACE(poly_decompose_32_asm)
+.balign 4
+MLD_ASM_FN_SYMBOL(poly_decompose_32_asm)
+        // Load constants into SIMD registers
+        movz w4, #57345
+        movk w4, #127, lsl #16
+        dup q.4s, w4
+
+        lsr w5, w4, #1
+        dup q_half.4s, w5
+
+        movz w7, #0xfe00
+        movk w7, #7, lsl #16
+        dup gamma2_2x.4s, w7
+
+        movi mask_15.4s, #15
+        movi offset_127.4s, #127
+
+        movz w11, #0x0200
+        movk w11, #8, lsl #16
+        dup barrett_const.4s, w11
+
+        mov count, #(64/4)
+
+poly_decompose_32_loop:
+        ldr q1, [a_ptr, #1*16]
+        ldr q2, [a_ptr, #2*16]
+        ldr q3, [a_ptr, #3*16]
+        ldr q0, [a_ptr], #4*16
+
+        decompose32 v4, v5, v1, v26
+        decompose32 v6, v7, v2, v26
+        decompose32 v16, v17, v3, v26
+        decompose32 v18, v19, v0, v26
+
+
+        str q4, [a1_ptr, #1*16]
+        str q6, [a1_ptr, #2*16]
+        str q16, [a1_ptr, #3*16]
+        str q18, [a1_ptr], #4*16
+        str q5, [a0_ptr, #1*16]
+        str q7, [a0_ptr, #2*16]
+        str q17, [a0_ptr, #3*16]
+        str q19, [a0_ptr], #4*16
+
+        subs count, count, #1
+        bne poly_decompose_32_loop
+
+        ret
+
+        .unreq a1_ptr
+        .unreq a0_ptr
+        .unreq a_ptr
+        .unreq count
+        .unreq q
+        .unreq q_half
+        .unreq gamma2_2x
+        .unreq mask_15
+        .unreq offset_127
+        .unreq barrett_const
+
+#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */
diff --git a/mldsa/native/aarch64/src/poly_decompose_88_asm.S b/mldsa/native/aarch64/src/poly_decompose_88_asm.S
new file mode 100644
index 00000000..7ba8c4d6
--- /dev/null
+++ b/mldsa/native/aarch64/src/poly_decompose_88_asm.S
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+#include "../../../common.h"
+
+#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
+
+.macro decompose88 a1, a0, input, temp
+        // Step 1: Compute ceil(a / 128) using floor((a + 127) / 128)
+        // This is the first part of computing a1 = floor(a / (2*GAMMA2))
+        // where 2*GAMMA2 = 190464. We break this into two steps:
+        // ceil(a / 128) followed by round(temp / 1488)
+        add \a1\().4s, \input\().4s, offset_127.4s
+        ushr \a1\().4s, \a1\().4s, #7
+
+        // Step 2: Barrett reduction with rounding: round(temp * 11275 / 2^24)
+        // This computes: round(ceil(a/128) / 1488)
+        // Combined: a1 ≈ round(ceil(a/128) / 1488) ≈ floor(a / 190464)
+        // sqrdmulh computes: (2 * temp * 1443201 + 2^31) >> 32
+        // which is equivalent to: (temp * 11275 + 2^23) >> 24.
+        sqrdmulh \a1\().4s, \a1\().4s, barrett_const.4s
+
+        // Step 3: Mask to valid range [0, 43] since (Q-1)/(2*GAMMA2) = 44
+        cmhi \temp\().4s, const44.4s, \a1\().4s
+        and \a1\().16b, \a1\().16b, \temp\().16b
+
+        // Step 4: Compute a0 = a - a1 * 2*GAMMA2 (low part of decomposition)
+        mls \input\().4s, \a1\().4s, gamma2_2x.4s
+
+        // Step 5: Conditional reduction: if a0 > (Q-1)/2 then a0 -= Q
+        cmgt \temp\().4s, \input\().4s, q_half.4s
+        and  \temp\().16b, \temp\().16b, q.16b
+        sub \a0\().4s, \input\().4s, \temp\().4s
+.endm
+
+        /* Parameters */
+        a1_ptr          .req x0     // Output polynomial with coefficients c1
+        a0_ptr          .req x1     // Output polynomial with coefficients c0
+        a_ptr           .req x2     // Input polynomial
+
+        count           .req x3
+
+        /* Constant register assignments */
+        q               .req v20    // Q = 8380417
+        q_half          .req v21    // (Q-1)/2
+        gamma2_2x       .req v22    // 2*GAMMA2 = 190464
+        const44        .req v23    // const = 44
+        offset_127      .req v24    // offset = 127
+        barrett_const   .req v25    // Barrett constant = 1443201
+
+.text
+.global MLD_ASM_NAMESPACE(poly_decompose_88_asm)
+.balign 4
+MLD_ASM_FN_SYMBOL(poly_decompose_88_asm)
+        // Load constants into SIMD registers
+        movz w4, #57345
+        movk w4, #127, lsl #16
+        dup q.4s, w4
+
+        lsr w5, w4, #1
+        dup q_half.4s, w5
+
+        movz w7, #0xe800
+        movk w7, #0x2, lsl #16
+        dup gamma2_2x.4s, w7
+
+        movi const44.4s, #44
+        movi offset_127.4s, #127
+
+        movz w11, #0x0581
+        movk w11, #0x16, lsl #16
+        dup barrett_const.4s, w11
+
+        mov count, #(64/4)
+poly_decompose_88_loop:
+        ldr q1, [a_ptr, #1*16]
+        ldr q2, [a_ptr, #2*16]
+        ldr q3, [a_ptr, #3*16]
+        ldr q0, [a_ptr], #4*16
+
+        decompose88 v4, v5, v1, v26
+        decompose88 v6, v7, v2, v26
+        decompose88 v16, v17, v3, v26
+        decompose88 v18, v19, v0, v26
+
+        str q4, [a1_ptr, #1*16]
+        str q6, [a1_ptr, #2*16]
+        str q16, [a1_ptr, #3*16]
+        str q18, [a1_ptr], #4*16
+        str q5, [a0_ptr, #1*16]
+        str q7, [a0_ptr, #2*16]
+        str q17, [a0_ptr, #3*16]
+        str q19, [a0_ptr], #4*16
+
+        subs count, count, #1
+        bne poly_decompose_88_loop
+
+        ret
+
+        .unreq a1_ptr
+        .unreq a0_ptr
+        .unreq a_ptr
+        .unreq count
+        .unreq q
+        .unreq q_half
+        .unreq gamma2_2x
+        .unreq const44
+        .unreq offset_127
+        .unreq barrett_const
+
+#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */

From 30d236c79797a2f64a5c1c59220dc05209559685 Mon Sep 17 00:00:00 2001
From: "Matthias J. Kannwischer" <matthias@kannwischer.eu>
Date: Thu, 7 Aug 2025 22:27:01 +0800
Subject: [PATCH 3/3] Implement poly_use_hint on top of poly_decompose

This eliminates the helper function use_hint and instead inlines it
into poly_use_hint to be able to re-use the native poly_decompose code.
The disadvantage is that it requires an additional polynomial on the
stack.

Resolves https://github.com/pq-code-package/mldsa-native/issues/400
Resolvse https://github.com/pq-code-package/mldsa-native/issues/398

Signed-off-by: Matthias J. Kannwischer <matthias@kannwischer.eu>
---
 mldsa/poly.c                            | 33 ++++++++++++++-
 mldsa/rounding.h                        | 46 ---------------------
 proofs/cbmc/poly_use_hint/Makefile      |  2 +-
 proofs/cbmc/use_hint/Makefile           | 55 -------------------------
 proofs/cbmc/use_hint/use_hint_harness.c | 11 -----
 5 files changed, 33 insertions(+), 114 deletions(-)
 delete mode 100644 proofs/cbmc/use_hint/Makefile
 delete mode 100644 proofs/cbmc/use_hint/use_hint_harness.c

diff --git a/mldsa/poly.c b/mldsa/poly.c
index 49e4c168..2b35c698 100644
--- a/mldsa/poly.c
+++ b/mldsa/poly.c
@@ -228,16 +228,47 @@ unsigned int mld_poly_make_hint(mld_poly *h, const mld_poly *a0,
 void mld_poly_use_hint(mld_poly *b, const mld_poly *a, const mld_poly *h)
 {
   unsigned int i;
+  mld_poly a0;
   mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q);
   mld_assert_bound(h->coeffs, MLDSA_N, 0, 2);
 
+  /* Reference: The reference implementation has a function use_hint operating
+   * on individual coefficients that first calls decompose and then uses the
+   * same conditionals as in the following. We instead call decompose on the
+   * entire polynomial with the benefit of being able to re-use the native
+   * poly_decompose implementation at the cost of a temporay polynomial on the
+   * stack.
+   */
+  mld_poly_decompose(b, &a0, a);
   for (i = 0; i < MLDSA_N; ++i)
   __loop__(
     invariant(i <= MLDSA_N)
     invariant(array_bound(b->coeffs, 0, i, 0, (MLDSA_Q-1)/(2*MLDSA_GAMMA2)))
+    invariant(forall(k0, i, MLDSA_N, b->coeffs[k0] == loop_entry(*b).coeffs[k0]))
   )
   {
-    b->coeffs[i] = mld_use_hint(a->coeffs[i], h->coeffs[i]);
+    if (h->coeffs[i] != 0)
+    {
+#if MLDSA_MODE == 2
+      if (a0.coeffs[i] > 0)
+      {
+        b->coeffs[i] = (b->coeffs[i] == 43) ? 0 : b->coeffs[i] + 1;
+      }
+      else
+      {
+        b->coeffs[i] = (b->coeffs[i] == 0) ? 43 : b->coeffs[i] - 1;
+      }
+#else  /* MLDSA_MODE == 2 */
+      if (a0.coeffs[i] > 0)
+      {
+        b->coeffs[i] = (b->coeffs[i] + 1) & 15;
+      }
+      else
+      {
+        b->coeffs[i] = (b->coeffs[i] - 1) & 15;
+      }
+#endif /* MLDSA_MODE != 2 */
+    }
   }
 
   mld_assert_bound(b->coeffs, MLDSA_N, 0, (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2));
diff --git a/mldsa/rounding.h b/mldsa/rounding.h
index d410bf47..baebe906 100644
--- a/mldsa/rounding.h
+++ b/mldsa/rounding.h
@@ -121,51 +121,5 @@ __contract__(
   return 0;
 }
 
-/*************************************************
- * Name:        mld_use_hint
- *
- * Description: Correct high bits according to hint.
- *
- * Arguments:   - int32_t a: input element
- *              - unsigned int hint: hint bit
- *
- * Returns corrected high bits.
- **************************************************/
-static MLD_INLINE int32_t mld_use_hint(int32_t a, unsigned int hint)
-__contract__(
-  requires(hint >= 0 && hint <= 1)
-  requires(a >= 0 && a < MLDSA_Q)
-  ensures(return_value >= 0 && return_value < (MLDSA_Q-1)/(2*MLDSA_GAMMA2))
-)
-{
-  int32_t a0, a1;
-
-  mld_decompose(&a0, &a1, a);
-  if (hint == 0)
-  {
-    return a1;
-  }
-
-#if MLDSA_MODE == 2
-  if (a0 > 0)
-  {
-    return (a1 == 43) ? 0 : a1 + 1;
-  }
-  else
-  {
-    return (a1 == 0) ? 43 : a1 - 1;
-  }
-#else  /* MLDSA_MODE == 2 */
-  if (a0 > 0)
-  {
-    return (a1 + 1) & 15;
-  }
-  else
-  {
-    return (a1 - 1) & 15;
-  }
-#endif /* MLDSA_MODE != 2 */
-}
-
 
 #endif /* !MLD_ROUNDING_H */
diff --git a/proofs/cbmc/poly_use_hint/Makefile b/proofs/cbmc/poly_use_hint/Makefile
index 883fefa4..9cf289e9 100644
--- a/proofs/cbmc/poly_use_hint/Makefile
+++ b/proofs/cbmc/poly_use_hint/Makefile
@@ -20,7 +20,7 @@ PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c
 PROJECT_SOURCES += $(SRCDIR)/mldsa/poly.c
 
 CHECK_FUNCTION_CONTRACTS=$(MLD_NAMESPACE)poly_use_hint
-USE_FUNCTION_CONTRACTS=mld_use_hint
+USE_FUNCTION_CONTRACTS=$(MLD_NAMESPACE)poly_decompose
 APPLY_LOOP_CONTRACTS=on
 USE_DYNAMIC_FRAMES=1
 
diff --git a/proofs/cbmc/use_hint/Makefile b/proofs/cbmc/use_hint/Makefile
deleted file mode 100644
index b06562ba..00000000
--- a/proofs/cbmc/use_hint/Makefile
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (c) The mldsa-native project authors
-# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
-
-include ../Makefile_params.common
-
-HARNESS_ENTRY = harness
-HARNESS_FILE = use_hint_harness
-
-# This should be a unique identifier for this proof, and will appear on the
-# Litani dashboard. It can be human-readable and contain spaces if you wish.
-PROOF_UID = use_hint
-
-DEFINES +=
-INCLUDES +=
-
-REMOVE_FUNCTION_BODY +=
-UNWINDSET +=
-
-PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c
-PROJECT_SOURCES += $(SRCDIR)/mldsa/poly.c
-
-CHECK_FUNCTION_CONTRACTS=mld_use_hint
-USE_FUNCTION_CONTRACTS=mld_decompose
-APPLY_LOOP_CONTRACTS=on
-USE_DYNAMIC_FRAMES=1
-
-# Disable any setting of EXTERNAL_SAT_SOLVER, and choose SMT backend instead
-EXTERNAL_SAT_SOLVER=
-CBMCFLAGS=--smt2
-
-FUNCTION_NAME = use_hint
-
-# If this proof is found to consume huge amounts of RAM, you can set the
-# EXPENSIVE variable. With new enough versions of the proof tools, this will
-# restrict the number of EXPENSIVE CBMC jobs running at once. See the
-# documentation in Makefile.common under the "Job Pools" heading for details.
-# EXPENSIVE = true
-
-# This function is large enough to need...
-CBMC_OBJECT_BITS = 8
-
-# If you require access to a file-local ("static") function or object to conduct
-# your proof, set the following (and do not include the original source file
-# ("mldsa/poly.c") in PROJECT_SOURCES).
-# REWRITTEN_SOURCES = $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i
-# include ../Makefile.common
-# $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i_SOURCE = $(SRCDIR)/mldsa/poly.c
-# $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i_FUNCTIONS = foo bar
-# $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i_OBJECTS = baz
-# Care is required with variables on the left-hand side: REWRITTEN_SOURCES must
-# be set before including Makefile.common, but any use of variables on the
-# left-hand side requires those variables to be defined. Hence, _SOURCE,
-# _FUNCTIONS, _OBJECTS is set after including Makefile.common.
-
-include ../Makefile.common
diff --git a/proofs/cbmc/use_hint/use_hint_harness.c b/proofs/cbmc/use_hint/use_hint_harness.c
deleted file mode 100644
index e1925ddf..00000000
--- a/proofs/cbmc/use_hint/use_hint_harness.c
+++ /dev/null
@@ -1,11 +0,0 @@
-// Copyright (c) The mldsa-native project authors
-// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
-
-#include "rounding.h"
-
-void harness(void)
-{
-  int32_t a, r;
-  unsigned int hint;
-  r = mld_use_hint(a, hint);
-}