From eeb9204058a0fb53e7a48ed39bc3630961fe351e Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Wed, 2 Jul 2025 11:33:22 +0800 Subject: [PATCH 1/3] AArch64: Native poly_pointwise_montgomery Signed-off-by: Matthias J. Kannwischer --- mldsa/native/aarch64/meta.h | 8 ++ .../native/aarch64/src/arith_native_aarch64.h | 3 + .../native/aarch64/src/pointwise_montgomery.S | 119 ++++++++++++++++++ mldsa/native/api.h | 20 +++ mldsa/poly.c | 7 ++ 5 files changed, 157 insertions(+) create mode 100644 mldsa/native/aarch64/src/pointwise_montgomery.S diff --git a/mldsa/native/aarch64/meta.h b/mldsa/native/aarch64/meta.h index 6b044a93..64c88d05 100644 --- a/mldsa/native/aarch64/meta.h +++ b/mldsa/native/aarch64/meta.h @@ -10,6 +10,7 @@ /* Set of primitives that this backend replaces */ #define MLD_USE_NATIVE_NTT #define MLD_USE_NATIVE_INTT +#define MLD_USE_NATIVE_POINTWISE /* Identifier for this backend so that source and assembly files * in the build can be appropriately guarded. */ @@ -31,6 +32,13 @@ static MLD_INLINE void mld_intt_native(int32_t data[MLDSA_N]) mld_aarch64_intt_zetas_layer123456); } +static MLD_INLINE void mld_pointwise_montgomery_native( + int32_t out[MLDSA_N], const int32_t in0[MLDSA_N], + const int32_t in1[MLDSA_N]) +{ + mld_pointwise_montgomery_asm(out, in0, in1); +} + #endif /* !__ASSEMBLER__ */ #endif /* !MLD_NATIVE_AARCH64_META_H */ diff --git a/mldsa/native/aarch64/src/arith_native_aarch64.h b/mldsa/native/aarch64/src/arith_native_aarch64.h index d3528e6f..3acc873c 100644 --- a/mldsa/native/aarch64/src/arith_native_aarch64.h +++ b/mldsa/native/aarch64/src/arith_native_aarch64.h @@ -32,4 +32,7 @@ void mld_ntt_asm(int32_t *, const int32_t *, const int32_t *); #define mld_intt_asm MLD_NAMESPACE(intt_asm) void mld_intt_asm(int32_t *, const int32_t *, const int32_t *); +#define mld_pointwise_montgomery_asm MLD_NAMESPACE(mld_pointwise_montgomery_asm) +void mld_pointwise_montgomery_asm(int32_t *, const int32_t *, const int32_t *); + #endif /* !MLD_NATIVE_AARCH64_SRC_ARITH_NATIVE_AARCH64_H */ diff --git a/mldsa/native/aarch64/src/pointwise_montgomery.S b/mldsa/native/aarch64/src/pointwise_montgomery.S new file mode 100644 index 00000000..bfa1d7ad --- /dev/null +++ b/mldsa/native/aarch64/src/pointwise_montgomery.S @@ -0,0 +1,119 @@ +/* Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_AARCH64) + +.macro montgomery_reduce_long res, inl, inh + uzp1 t0.4s, \inl\().4s, \inh\().4s + mul t0.4s, t0.4s, modulus_twisted.4s + smlal \inl\().2d, t0.2s, modulus.2s + smlal2 \inh\().2d, t0.4s, modulus.4s + uzp2 \res\().4s, \inl\().4s, \inh\().4s +.endm + + +.macro pmull dl, dh, a, b + smull \dl\().2d, \a\().2s, \b\().2s + smull2 \dh\().2d, \a\().4s, \b\().4s +.endm + +.macro pmlal dl, dh, a, b + smlal \dl\().2d, \a\().2s, \b\().2s + smlal2 \dh\().2d, \a\().4s, \b\().4s +.endm + +.macro save_vregs + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +.macro push_stack + save_vregs +.endm + +.macro pop_stack + restore_vregs +.endm + +out_ptr .req x0 +a0_ptr .req x1 +b0_ptr .req x2 +count .req x3 +wtmp .req w3 + +modulus .req v0 +modulus_twisted .req v1 + +aa .req v2 +bb .req v3 +res .req v4 +resl .req v5 +resh .req v6 +t0 .req 
v7
+
+q_aa .req q2
+q_bb .req q3
+q_res .req q4
+
+.text
+.global MLD_ASM_NAMESPACE(mld_pointwise_montgomery_asm)
+.balign 4
+MLD_ASM_FN_SYMBOL(mld_pointwise_montgomery_asm)
+        push_stack
+
+        // load q = 8380417
+        movz wtmp, #57345
+        movk wtmp, #127, lsl #16
+        dup modulus.4s, wtmp
+
+        // load -q^-1 = 4236238847
+        movz wtmp, #57343
+        movk wtmp, #64639, lsl #16
+        dup modulus_twisted.4s, wtmp
+        mov count, #(MLDSA_N / 4)
+loop_start:
+
+
+        ldr q_aa, [a0_ptr], #64
+        ldr q_bb, [b0_ptr], #64
+        pmull resl, resh, aa, bb
+        montgomery_reduce_long res, resl, resh
+        str q_res, [out_ptr], #64
+
+        ldr q_aa, [a0_ptr, #-48]
+        ldr q_bb, [b0_ptr, #-48]
+        pmull resl, resh, aa, bb
+        montgomery_reduce_long res, resl, resh
+        str q_res, [out_ptr, #-48]
+
+        ldr q_aa, [a0_ptr, #-32]
+        ldr q_bb, [b0_ptr, #-32]
+        pmull resl, resh, aa, bb
+        montgomery_reduce_long res, resl, resh
+        str q_res, [out_ptr, #-32]
+
+        ldr q_aa, [a0_ptr, #-16]
+        ldr q_bb, [b0_ptr, #-16]
+        pmull resl, resh, aa, bb
+        montgomery_reduce_long res, resl, resh
+        str q_res, [out_ptr, #-16]
+
+        subs count, count, #4
+        cbnz count, loop_start
+
+        pop_stack
+        ret
+#endif /* MLD_ARITH_BACKEND_AARCH64 */
diff --git a/mldsa/native/api.h b/mldsa/native/api.h
index eb3196db..7edc87a7 100644
--- a/mldsa/native/api.h
+++ b/mldsa/native/api.h
@@ -99,4 +99,24 @@ static MLD_INLINE void mld_poly_permute_bitrev_to_custom(int32_t p[MLDSA_N])
 static MLD_INLINE void mld_intt_native(int16_t p[MLDSA_N])
 #endif /* MLD_USE_NATIVE_INTT */
 
+#if defined(MLD_USE_NATIVE_POINTWISE)
+/*************************************************
+ * Name:        mld_pointwise_montgomery_native
+ *
+ * Description: Pointwise multiplication of polynomials in NTT domain
+ *              representation and multiplication of resulting polynomial
+ *              by 2^{-32}.
+ *
+ * Arguments:   - int32_t out[MLDSA_N]:       pointer to output polynomial
+ *              - const int32_t in0[MLDSA_N]: pointer to first input
+ *                                            polynomial
+ *              - const int32_t in1[MLDSA_N]: pointer to second input
+ *                                            polynomial
+ **************************************************/
+static MLD_INLINE void mld_pointwise_montgomery_native(
+    int32_t out[MLDSA_N], const int32_t in0[MLDSA_N],
+    const int32_t in1[MLDSA_N]);
+#endif /* MLD_USE_NATIVE_POINTWISE */
+
 #endif /* !MLD_NATIVE_API_H */
diff --git a/mldsa/poly.c b/mldsa/poly.c
index f97750fb..7c9e4c8d 100644
--- a/mldsa/poly.c
+++ b/mldsa/poly.c
@@ -131,6 +131,7 @@ void poly_invntt_tomont(poly *a)
 }
 #endif /* MLD_USE_NATIVE_INTT */
 
+#if !defined(MLD_USE_NATIVE_POINTWISE)
 void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b)
 {
   unsigned int i;
@@ -142,6 +143,12 @@ void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b)
     c->coeffs[i] = montgomery_reduce((int64_t)a->coeffs[i] * b->coeffs[i]);
   }
 }
+#else /* !MLD_USE_NATIVE_POINTWISE */
+void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b)
+{
+  mld_pointwise_montgomery_native(c->coeffs, a->coeffs, b->coeffs);
+}
+#endif /* MLD_USE_NATIVE_POINTWISE */
 
 void poly_power2round(poly *a1, poly *a0, const poly *a)
 {

From 47a3e21b09ac120c797b631e325f80a7695cbe8b Mon Sep 17 00:00:00 2001
From: "Matthias J. Kannwischer"
Date: Wed, 2 Jul 2025 11:55:06 +0800
Subject: [PATCH 2/3] AVX2: Native poly_pointwise_montgomery

Signed-off-by: Matthias J. Kannwischer
---
 mldsa/native/x86_64/meta.h                    |   9 ++
 mldsa/native/x86_64/src/arith_native_x86_64.h |   5 +
 mldsa/native/x86_64/src/pointwise.S           | 124 ++++++++++++++++++
 3 files changed, 138 insertions(+)
 create mode 100644 mldsa/native/x86_64/src/pointwise.S

diff --git a/mldsa/native/x86_64/meta.h b/mldsa/native/x86_64/meta.h
index 373cf12b..87c21ff8 100644
--- a/mldsa/native/x86_64/meta.h
+++ b/mldsa/native/x86_64/meta.h
@@ -14,6 +14,7 @@
 #define MLD_USE_NATIVE_NTT_CUSTOM_ORDER
 #define MLD_USE_NATIVE_NTT
 #define MLD_USE_NATIVE_INTT
+#define MLD_USE_NATIVE_POINTWISE
 
 #if !defined(__ASSEMBLER__)
 #include <immintrin.h>
@@ -34,6 +35,14 @@ static MLD_INLINE void mld_intt_native(int32_t data[MLDSA_N])
   mld_invntt_avx2((__m256i *)data, mld_qdata.vec);
 }
 
+static MLD_INLINE void mld_pointwise_montgomery_native(
+    int32_t out[MLDSA_N], const int32_t in0[MLDSA_N],
+    const int32_t in1[MLDSA_N])
+{
+  mld_pointwise_montgomery_avx2((__m256i *)out, (const __m256i *)in0,
+                                (const __m256i *)in1, mld_qdata.vec);
+}
+
 #endif /* !__ASSEMBLER__ */
 
 #endif /* !MLD_NATIVE_X86_64_META_H */
diff --git a/mldsa/native/x86_64/src/arith_native_x86_64.h b/mldsa/native/x86_64/src/arith_native_x86_64.h
index 602f7485..4058666d 100644
--- a/mldsa/native/x86_64/src/arith_native_x86_64.h
+++ b/mldsa/native/x86_64/src/arith_native_x86_64.h
@@ -19,4 +19,9 @@ void mld_invntt_avx2(__m256i *r, const __m256i *mld_qdata);
 #define mld_nttunpack_avx2 MLD_NAMESPACE(nttunpack_avx2)
 void mld_nttunpack_avx2(__m256i *r);
 
+#define mld_pointwise_montgomery_avx2 \
+  MLD_NAMESPACE(mld_pointwise_montgomery_avx2)
+void mld_pointwise_montgomery_avx2(__m256i *r, const __m256i *a,
+                                   const __m256i *b, const __m256i *mld_qdata);
+
 #endif /* !MLD_NATIVE_X86_64_SRC_ARITH_NATIVE_X86_64_H */
diff --git a/mldsa/native/x86_64/src/pointwise.S b/mldsa/native/x86_64/src/pointwise.S
new file mode 100644
index 00000000..213862cc
--- /dev/null
+++ b/mldsa/native/x86_64/src/pointwise.S
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Dilithium implementation @[REF_AVX2].
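+ *
+ * For each coefficient pair the kernel computes the Montgomery product
+ * r[i] = a[i] * b[i] * 2^{-32} (mod q). The main loop processes 24
+ * coefficients per iteration (10 iterations); the code after the loop
+ * handles the remaining 16 of the MLDSA_N = 256 coefficients.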
+ */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +.text +.global MLD_ASM_NAMESPACE(mld_pointwise_montgomery_avx2) +MLD_ASM_FN_SYMBOL(mld_pointwise_montgomery_avx2) +#consts +vmovdqa MLD_AVX2_BACKEND_DATA_OFFSET_8XQINV*4(%rcx),%ymm0 +vmovdqa MLD_AVX2_BACKEND_DATA_OFFSET_8XQ*4(%rcx),%ymm1 + +xor %eax,%eax +_looptop1: +#load +vmovdqa (%rsi),%ymm2 +vmovdqa 32(%rsi),%ymm4 +vmovdqa 64(%rsi),%ymm6 +vmovdqa (%rdx),%ymm10 +vmovdqa 32(%rdx),%ymm12 +vmovdqa 64(%rdx),%ymm14 +vpsrlq $32,%ymm2,%ymm3 +vpsrlq $32,%ymm4,%ymm5 +vmovshdup %ymm6,%ymm7 +vpsrlq $32,%ymm10,%ymm11 +vpsrlq $32,%ymm12,%ymm13 +vmovshdup %ymm14,%ymm15 + +#mul +vpmuldq %ymm2,%ymm10,%ymm2 +vpmuldq %ymm3,%ymm11,%ymm3 +vpmuldq %ymm4,%ymm12,%ymm4 +vpmuldq %ymm5,%ymm13,%ymm5 +vpmuldq %ymm6,%ymm14,%ymm6 +vpmuldq %ymm7,%ymm15,%ymm7 + +#reduce +vpmuldq %ymm0,%ymm2,%ymm10 +vpmuldq %ymm0,%ymm3,%ymm11 +vpmuldq %ymm0,%ymm4,%ymm12 +vpmuldq %ymm0,%ymm5,%ymm13 +vpmuldq %ymm0,%ymm6,%ymm14 +vpmuldq %ymm0,%ymm7,%ymm15 +vpmuldq %ymm1,%ymm10,%ymm10 +vpmuldq %ymm1,%ymm11,%ymm11 +vpmuldq %ymm1,%ymm12,%ymm12 +vpmuldq %ymm1,%ymm13,%ymm13 +vpmuldq %ymm1,%ymm14,%ymm14 +vpmuldq %ymm1,%ymm15,%ymm15 +vpsubq %ymm10,%ymm2,%ymm2 +vpsubq %ymm11,%ymm3,%ymm3 +vpsubq %ymm12,%ymm4,%ymm4 +vpsubq %ymm13,%ymm5,%ymm5 +vpsubq %ymm14,%ymm6,%ymm6 +vpsubq %ymm15,%ymm7,%ymm7 +vpsrlq $32,%ymm2,%ymm2 +vpsrlq $32,%ymm4,%ymm4 +vmovshdup %ymm6,%ymm6 + +#store +vpblendd $0xAA,%ymm3,%ymm2,%ymm2 +vpblendd $0xAA,%ymm5,%ymm4,%ymm4 +vpblendd $0xAA,%ymm7,%ymm6,%ymm6 +vmovdqa %ymm2,(%rdi) +vmovdqa %ymm4,32(%rdi) +vmovdqa %ymm6,64(%rdi) + +add $96,%rdi +add $96,%rsi +add $96,%rdx +add $1,%eax +cmp $10,%eax +jb _looptop1 + +vmovdqa (%rsi),%ymm2 +vmovdqa 32(%rsi),%ymm4 +vmovdqa (%rdx),%ymm10 +vmovdqa 32(%rdx),%ymm12 +vpsrlq $32,%ymm2,%ymm3 +vpsrlq $32,%ymm4,%ymm5 +vmovshdup %ymm10,%ymm11 +vmovshdup %ymm12,%ymm13 + +#mul +vpmuldq %ymm2,%ymm10,%ymm2 +vpmuldq %ymm3,%ymm11,%ymm3 +vpmuldq %ymm4,%ymm12,%ymm4 +vpmuldq %ymm5,%ymm13,%ymm5 + +#reduce +vpmuldq %ymm0,%ymm2,%ymm10 +vpmuldq %ymm0,%ymm3,%ymm11 +vpmuldq %ymm0,%ymm4,%ymm12 +vpmuldq %ymm0,%ymm5,%ymm13 +vpmuldq %ymm1,%ymm10,%ymm10 +vpmuldq %ymm1,%ymm11,%ymm11 +vpmuldq %ymm1,%ymm12,%ymm12 +vpmuldq %ymm1,%ymm13,%ymm13 +vpsubq %ymm10,%ymm2,%ymm2 +vpsubq %ymm11,%ymm3,%ymm3 +vpsubq %ymm12,%ymm4,%ymm4 +vpsubq %ymm13,%ymm5,%ymm5 +vpsrlq $32,%ymm2,%ymm2 +vmovshdup %ymm4,%ymm4 + +#store +vpblendd $0x55,%ymm2,%ymm3,%ymm2 +vpblendd $0x55,%ymm4,%ymm5,%ymm4 +vmovdqa %ymm2,(%rdi) +vmovdqa %ymm4,32(%rdi) + +ret + +#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ From 1fcd573f91fa2d317c4c95fbfbf87e477d1d9817 Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Thu, 3 Jul 2025 14:44:54 +0800 Subject: [PATCH 3/3] [TEST]: Apply SLOTHY to AArch64 pointwise_montgomery.S Signed-off-by: Matthias J. 
Kannwischer --- dev/aarch64_clean/src/pointwise_montgomery.S | 119 +++++++ mldsa/native/aarch64/src/Makefile | 42 +++ .../native/aarch64/src/pointwise_montgomery.S | 321 ++++++++++++++++-- 3 files changed, 456 insertions(+), 26 deletions(-) create mode 100644 dev/aarch64_clean/src/pointwise_montgomery.S create mode 100644 mldsa/native/aarch64/src/Makefile diff --git a/dev/aarch64_clean/src/pointwise_montgomery.S b/dev/aarch64_clean/src/pointwise_montgomery.S new file mode 100644 index 00000000..bfa1d7ad --- /dev/null +++ b/dev/aarch64_clean/src/pointwise_montgomery.S @@ -0,0 +1,119 @@ +/* Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_AARCH64) + +.macro montgomery_reduce_long res, inl, inh + uzp1 t0.4s, \inl\().4s, \inh\().4s + mul t0.4s, t0.4s, modulus_twisted.4s + smlal \inl\().2d, t0.2s, modulus.2s + smlal2 \inh\().2d, t0.4s, modulus.4s + uzp2 \res\().4s, \inl\().4s, \inh\().4s +.endm + + +.macro pmull dl, dh, a, b + smull \dl\().2d, \a\().2s, \b\().2s + smull2 \dh\().2d, \a\().4s, \b\().4s +.endm + +.macro pmlal dl, dh, a, b + smlal \dl\().2d, \a\().2s, \b\().2s + smlal2 \dh\().2d, \a\().4s, \b\().4s +.endm + +.macro save_vregs + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +.macro push_stack + save_vregs +.endm + +.macro pop_stack + restore_vregs +.endm + +out_ptr .req x0 +a0_ptr .req x1 +b0_ptr .req x2 +count .req x3 +wtmp .req w3 + +modulus .req v0 +modulus_twisted .req v1 + +aa .req v2 +bb .req v3 +res .req v4 +resl .req v5 +resh .req v6 +t0 .req v7 + +q_aa .req q2 +q_bb .req q3 +q_res .req q4 + +.text +.global MLD_ASM_NAMESPACE(mld_pointwise_montgomery_asm) +.balign 4 +MLD_ASM_FN_SYMBOL(mld_pointwise_montgomery_asm) + push_stack + + // load q = 8380417 + movz wtmp, #57345 + movk wtmp, #127, lsl #16 + dup modulus.4s, wtmp + + // load -q^-1 = 4236238847 + movz wtmp, #57343 + movk wtmp, #64639, lsl #16 + dup modulus_twisted.4s, wtmp + mov count, #(MLDSA_N / 4) +loop_start: + + + ldr q_aa, [a0_ptr], #64 + ldr q_bb, [b0_ptr], #64 + pmull resl, resh, aa, bb + montgomery_reduce_long res, resl, resh + str q_res, [out_ptr], #64 + + ldr q_aa, [a0_ptr, #-48] + ldr q_bb, [b0_ptr, #-48] + pmull resl, resh, aa, bb + montgomery_reduce_long res, resl, resh + str q_res, [out_ptr, #-48] + + ldr q_aa, [a0_ptr, #-32] + ldr q_bb, [b0_ptr, #-32] + pmull resl, resh, aa, bb + montgomery_reduce_long res, resl, resh + str q_res, [out_ptr, #-32] + + ldr q_aa, [a0_ptr, #-16] + ldr q_bb, [b0_ptr, #-16] + pmull resl, resh, aa, bb + montgomery_reduce_long res, resl, resh + str q_res, [out_ptr, #-16] + + subs count, count, #4 + cbnz count, loop_start + + pop_stack + ret +#endif /* MLD_ARITH_BACKEND_AARCH64 */ diff --git a/mldsa/native/aarch64/src/Makefile b/mldsa/native/aarch64/src/Makefile new file mode 100644 index 00000000..db8f5554 --- /dev/null +++ b/mldsa/native/aarch64/src/Makefile @@ -0,0 +1,42 @@ +# Copyright (c) The mldsa-native project authors +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +###### +# To run, see the README.md file +###### +.PHONY: all clean + +# ISA to optimize for +TARGET_ISA=Arm_AArch64 + +# MicroArch target to optimize for +TARGET_MICROARCH=Arm_Neoverse_N1_experimental + +SLOTHY_EXTRA_FLAGS ?= + 
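+# Extra SLOTHY options can be passed on the command line, e.g.
+#   make SLOTHY_EXTRA_FLAGS="-c constraints.stalls_first_attempt=32"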
+SLOTHY_FLAGS=-c sw_pipelining.enabled=true \ + -c inputs_are_outputs \ + -c sw_pipelining.minimize_overlapping=False \ + -c sw_pipelining.allow_post \ + -c variable_size \ + -c constraints.stalls_first_attempt=64 \ + $(SLOTHY_EXTRA_FLAGS) + +# For kernels which stash callee-saved v8-v15 but don't stash callee-saved GPRs x19-x30. +# Allow SLOTHY to use all V-registers, but only caller-saved GPRs. +RESERVE_X_ONLY_FLAG=-c reserved_regs="[x18--x30,sp]" + +# Used for kernels which don't stash callee-saved registers. +# Restrict SLOTHY to caller-saved registers. +RESERVE_ALL_FLAG=-c reserved_regs="[x18--x30,sp,v8--v15]" + +all: pointwise_montgomery.S + +# These units explicitly save and restore registers v8-v15, so SLOTHY can freely use +# those registers. +pointwise_montgomery.S: ../../../../dev/aarch64_clean/src/pointwise_montgomery.S + slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $@ -l loop_start $(SLOTHY_FLAGS) $(RESERVE_X_ONLY_FLAG) + + +clean: + -$(RM) -rf pointwise_montgomery.S diff --git a/mldsa/native/aarch64/src/pointwise_montgomery.S b/mldsa/native/aarch64/src/pointwise_montgomery.S index bfa1d7ad..f613f33a 100644 --- a/mldsa/native/aarch64/src/pointwise_montgomery.S +++ b/mldsa/native/aarch64/src/pointwise_montgomery.S @@ -84,35 +84,304 @@ MLD_ASM_FN_SYMBOL(mld_pointwise_montgomery_asm) movk wtmp, #64639, lsl #16 dup modulus_twisted.4s, wtmp mov count, #(MLDSA_N / 4) + // Instructions: 40 + // Expected cycles: 28 + // Expected IPC: 1.43 + // + // Cycle bound: 28.0 + // IPC bound: 1.43 + // + // Wall time: 0.19s + // User time: 0.19s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q19, [x1], #64 // *............................. + ldr q14, [x2], #64 // *............................. + ldr q12, [x1, #-48] // .*............................ + ldr q6, [x2, #-48] // .*............................ + ldr q29, [x2, #-16] // ..*........................... + ldr q25, [x1, #-16] // ..*........................... + ldr q4, [x2, #16] // ...*.......................... + ldr q31, [x1, #16] // ...*.......................... + ldr q17, [x1, #-32] // ....*......................... + smull v5.2D, v19.2S, v14.2S // ....*......................... + smull2 v28.2D, v19.4S, v14.4S // .....*........................ + ldr q20, [x2, #-32] // .....*........................ + smull v22.2D, v12.2S, v6.2S // ......*....................... + ldr q16, [x1], #64 // ......*....................... + smull2 v8.2D, v12.4S, v6.4S // .......*...................... + ldr q2, [x2], #64 // .......*...................... + smull2 v23.2D, v25.4S, v29.4S // ........*..................... + smull v29.2D, v25.2S, v29.2S // .........*.................... + uzp1 v10.4S, v5.4S, v28.4S // .........*.................... + smull v11.2D, v31.2S, v4.2S // ..........*................... + mul v30.4S, v10.4S, v1.4S // ...........*.................. + uzp1 v7.4S, v22.4S, v8.4S // ...........*.................. + smull2 v18.2D, v31.4S, v4.4S // .............*................ + uzp1 v26.4S, v29.4S, v23.4S // .............*................ + mul v19.4S, v7.4S, v1.4S // ..............*............... + smull v14.2D, v17.2S, v20.2S // ................*............. + smull2 v17.2D, v17.4S, v20.4S // .................*............ + uzp1 v15.4S, v11.4S, v18.4S // .................*............ + smlal v5.2D, v30.2S, v0.2S // ..................*........... + mul v4.4S, v26.4S, v1.4S // ...................*.......... + smlal2 v28.2D, v30.4S, v0.4S // .....................*........ 
+ uzp1 v12.4S, v14.4S, v17.4S // .....................*........ + smlal v22.2D, v19.2S, v0.2S // ......................*....... + smlal2 v8.2D, v19.4S, v0.4S // .......................*...... + smlal2 v23.2D, v4.4S, v0.4S // ........................*..... + smlal v29.2D, v4.2S, v0.2S // .........................*.... + uzp2 v28.4S, v5.4S, v28.4S // .........................*.... + smull v26.2D, v16.2S, v2.2S // ..........................*... + mul v5.4S, v15.4S, v1.4S // ...........................*.. + uzp2 v31.4S, v22.4S, v8.4S // ...........................*.. + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q2, [x2], #64 // *.............................. + // ldr q16, [x1], #64 // *.............................. + // ldr q25, [x1, #-48] // .*............................. + // ldr q15, [x2, #-48] // .*............................. + // smull v11.2D, v25.2S, v15.2S // ......*........................ + // smull2 v18.2D, v25.4S, v15.4S // .......*....................... + // uzp1 v9.4S, v11.4S, v18.4S // ...........*................... + // smull v26.2D, v16.2S, v2.2S // ....*.......................... + // mul v5.4S, v9.4S, v1.4S // ..............*................ + // ldr q20, [x1, #-16] // ..*............................ + // ldr q19, [x2, #-16] // ..*............................ + // smull2 v8.2D, v16.4S, v2.4S // .....*......................... + // ldr q2, [x2], #64 // .......*....................... + // ldr q16, [x1], #64 // ......*........................ + // ldr q25, [x1, #-48] // ...*........................... + // smull v29.2D, v20.2S, v19.2S // .........*..................... + // smull2 v23.2D, v20.4S, v19.4S // ........*...................... + // uzp1 v24.4S, v26.4S, v8.4S // .........*..................... + // smlal v11.2D, v5.2S, v0.2S // ......................*........ + // mul v22.4S, v24.4S, v1.4S // ...........*................... + // ldr q15, [x2, #-48] // ...*........................... + // ldr q30, [x1, #-96] // ....*.......................... + // uzp1 v19.4S, v29.4S, v23.4S // .............*................. + // smlal2 v18.2D, v5.4S, v0.4S // .......................*....... + // smlal2 v8.2D, v22.4S, v0.4S // .....................*......... + // mul v19.4S, v19.4S, v1.4S // ...................*........... + // ldr q6, [x2, #-96] // .....*......................... + // uzp2 v31.4S, v11.4S, v18.4S // ...........................*... + // smull v11.2D, v25.2S, v15.2S // ..........*.................... + // smull2 v18.2D, v25.4S, v15.4S // .............*................. + // smlal v26.2D, v22.2S, v0.2S // ..................*............ + // smull v14.2D, v30.2S, v6.2S // ................*.............. + // smull2 v17.2D, v30.4S, v6.4S // .................*............. + // smlal v29.2D, v19.2S, v0.2S // .........................*..... + // uzp1 v9.4S, v11.4S, v18.4S // .................*............. + // smlal2 v23.2D, v19.4S, v0.4S // ........................*...... + // uzp2 v28.4S, v26.4S, v8.4S // .........................*..... + // smull v26.2D, v16.2S, v2.2S // ..........................*.... + // mul v5.4S, v9.4S, v1.4S // ...........................*... + // uzp1 v12.4S, v14.4S, v17.4S // .....................*......... 
+ + sub count, count, #8 loop_start: + // Instructions: 40 + // Expected cycles: 24 + // Expected IPC: 1.67 + // + // Cycle bound: 24.0 + // IPC bound: 1.67 + // + // Wall time: 0.53s + // User time: 0.53s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q20, [x1, #-16] // *............................. + ldr q19, [x2, #-16] // *............................. + smull2 v8.2D, v16.4S, v2.4S // .*............................ + ldr q2, [x2], #64 // .e............................ + mul v3.4S, v12.4S, v1.4S // ..l........................... + ldr q16, [x1], #64 // ..e........................... + ldr q25, [x1, #-48] // ...e.......................... + str q31, [x0, #16] // ...l.......................... + uzp2 v4.4S, v29.4S, v23.4S // ....l......................... + smull v29.2D, v20.2S, v19.2S // ....*......................... + smull2 v23.2D, v20.4S, v19.4S // .....*........................ + uzp1 v24.4S, v26.4S, v8.4S // .....*........................ + smlal v11.2D, v5.2S, v0.2S // ......*....................... + str q28, [x0], #64 // ......l....................... + mul v22.4S, v24.4S, v1.4S // .......*...................... + ldr q15, [x2, #-48] // .......e...................... + ldr q30, [x1, #-96] // ........*..................... + smlal v14.2D, v3.2S, v0.2S // .........l.................... + uzp1 v19.4S, v29.4S, v23.4S // .........*.................... + smlal2 v18.2D, v5.4S, v0.4S // ..........*................... + str q4, [x0, #-16] // ..........l................... + smlal2 v17.2D, v3.4S, v0.4S // ...........l.................. + smlal2 v8.2D, v22.4S, v0.4S // ............*................. + mul v19.4S, v19.4S, v1.4S // .............*................ + ldr q6, [x2, #-96] // ..............*............... + uzp2 v31.4S, v11.4S, v18.4S // ...............*.............. + smull v11.2D, v25.2S, v15.2S // ...............e.............. + smull2 v18.2D, v25.4S, v15.4S // ................e............. + uzp2 v21.4S, v14.4S, v17.4S // ................l............. + smlal v26.2D, v22.2S, v0.2S // .................*............ + smull v14.2D, v30.2S, v6.2S // ..................*........... + str q21, [x0, #-32] // ..................l........... + smull2 v17.2D, v30.4S, v6.4S // ...................*.......... + smlal v29.2D, v19.2S, v0.2S // ....................*......... + uzp1 v9.4S, v11.4S, v18.4S // ....................e......... + smlal2 v23.2D, v19.4S, v0.4S // .....................*........ + uzp2 v28.4S, v26.4S, v8.4S // .....................*........ + smull v26.2D, v16.2S, v2.2S // ......................e....... + mul v5.4S, v9.4S, v1.4S // .......................e...... + uzp1 v12.4S, v14.4S, v17.4S // .......................*...... + + // ----------------------- cycle (expected) ------------------------> + // 0 25 50 + // |------------------------|------------------------|--------------- + // ldr q2, [x1], #64 // .e.....................'.~.....................'.~................ + // ldr q3, [x2], #64 // e......................'~......................'~................. + // smull v5.2d, v2.2s, v3.2s // .....................e.'.....................~.'.................. + // smull2 v6.2d, v2.4s, v3.4s // ~......................'*......................'~................. + // uzp1 v7.4s, v5.4s, v6.4s // ....~..................'....*..................'....~............. + // mul v7.4s, v7.4s, v1.4s // ......~................'......*................'......~........... 
+ // smlal v5.2d, v7.2s, v0.2s // ................~......'................*......'................~. + // smlal2 v6.2d, v7.4s, v0.4s // ...........~...........'...........*...........'...........~...... + // uzp2 v4.4s, v5.4s, v6.4s // ....................~..'....................*..'.................. + // str q4, [x0], #64 // .....~.................'.....~.................'.....l............ + // ldr q2, [x1, #-48] // ..e....................'..~....................'..~............... + // ldr q3, [x2, #-48] // ......e................'......~................'......~........... + // smull v5.2d, v2.2s, v3.2s // ..............e........'..............~........'..............~... + // smull2 v6.2d, v2.4s, v3.4s // ...............e.......'...............~.......'...............~.. + // uzp1 v7.4s, v5.4s, v6.4s // ...................e...'...................~...'.................. + // mul v7.4s, v7.4s, v1.4s // ......................e'......................~'.................. + // smlal v5.2d, v7.2s, v0.2s // .....~.................'.....*.................'.....~............ + // smlal2 v6.2d, v7.4s, v0.4s // .........~.............'.........*.............'.........~........ + // uzp2 v4.4s, v5.4s, v6.4s // ..............~........'..............*........'..............~... + // str q4, [x0, #-48] // ..~....................'..~....................'..l............... + // ldr q2, [x1, #-32] // .......~...............'.......*...............'.......~.......... + // ldr q3, [x2, #-32] // .............~.........'.............*.........'.............~.... + // smull v5.2d, v2.2s, v3.2s // .................~.....'.................*.....'.................. + // smull2 v6.2d, v2.4s, v3.4s // ..................~....'..................*....'.................. + // uzp1 v7.4s, v5.4s, v6.4s // ......................~'......................*'.................. + // mul v7.4s, v7.4s, v1.4s // .~.....................'.~.....................'.l................ + // smlal v5.2d, v7.2s, v0.2s // ........~..............'........~..............'........l......... + // smlal2 v6.2d, v7.4s, v0.4s // ..........~............'..........~............'..........l....... + // uzp2 v4.4s, v5.4s, v6.4s // ...............~.......'...............~.......'...............l.. + // str q4, [x0, #-32] // .................~.....'.................~.....'.................l + // ldr q2, [x1, #-16] // .......................*.......................~.................. + // ldr q3, [x2, #-16] // .......................*.......................~.................. + // smull v5.2d, v2.2s, v3.2s // ...~...................'...*...................'...~.............. + // smull2 v6.2d, v2.4s, v3.4s // ....~..................'....*..................'....~............. + // uzp1 v7.4s, v5.4s, v6.4s // ........~..............'........*..............'........~......... + // mul v7.4s, v7.4s, v1.4s // ............~..........'............*..........'............~..... + // smlal v5.2d, v7.2s, v0.2s // ...................~...'...................*...'.................. + // smlal2 v6.2d, v7.4s, v0.4s // ....................~..'....................*..'.................. + // uzp2 v4.4s, v5.4s, v6.4s // ...~...................'...~...................'...l.............. + // str q4, [x0, #-16] // .........~.............'.........~.............'.........l........ 
+ + subs count, count, 4 + cbnz count, loop_start + // Instructions: 40 + // Expected cycles: 29 + // Expected IPC: 1.38 + // + // Cycle bound: 29.0 + // IPC bound: 1.38 + // + // Wall time: 0.21s + // User time: 0.21s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + uzp2 v22.4S, v29.4S, v23.4S // *............................. + mul v19.4S, v12.4S, v1.4S // *............................. + ldr q27, [x1, #-16] // .*............................ + ldr q13, [x2, #-16] // .*............................ + str q28, [x0], #64 // ..*........................... + smlal v11.2D, v5.2S, v0.2S // ..*........................... + smlal2 v18.2D, v5.4S, v0.4S // ...*.......................... + ldr q29, [x1, #-32] // ...*.......................... + smull2 v16.2D, v16.4S, v2.4S // ....*......................... + ldr q8, [x2, #-32] // ....*......................... + str q22, [x0, #-16] // .....*........................ + smull2 v5.2D, v27.4S, v13.4S // .....*........................ + smlal2 v17.2D, v19.4S, v0.4S // ......*....................... + str q31, [x0, #-48] // ......*....................... + uzp2 v3.4S, v11.4S, v18.4S // .......*...................... + smull v11.2D, v27.2S, v13.2S // .......*...................... + uzp1 v24.4S, v26.4S, v16.4S // ........*..................... + smull v22.2D, v29.2S, v8.2S // ........*..................... + smull2 v12.2D, v29.4S, v8.4S // .........*.................... + str q3, [x0, #16] // .........*.................... + smlal v14.2D, v19.2S, v0.2S // ..........*................... + mul v28.4S, v24.4S, v1.4S // ...........*.................. + uzp1 v4.4S, v11.4S, v5.4S // ...........*.................. + mul v31.4S, v4.4S, v1.4S // .............*................ + uzp1 v30.4S, v22.4S, v12.4S // .............*................ + uzp2 v21.4S, v14.4S, v17.4S // ..............*............... + mul v6.4S, v30.4S, v1.4S // ...............*.............. + str q21, [x0, #-32] // ................*............. + smlal v26.2D, v28.2S, v0.2S // .................*............ + smlal2 v16.2D, v28.4S, v0.4S // ..................*........... + smlal2 v5.2D, v31.4S, v0.4S // ...................*.......... + smlal v11.2D, v31.2S, v0.2S // ....................*......... + smlal2 v12.2D, v6.4S, v0.4S // .....................*........ + smlal v22.2D, v6.2S, v0.2S // ......................*....... + uzp2 v10.4S, v26.4S, v16.4S // ......................*....... + uzp2 v13.4S, v11.4S, v5.4S // ........................*..... + str q10, [x0], #64 // ........................*..... + str q13, [x0, #-16] // ..........................*... + uzp2 v13.4S, v22.4S, v12.4S // ..........................*... + str q13, [x0, #-32] // ............................*. + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q20, [x1, #-16] // .*............................. + // ldr q19, [x2, #-16] // .*............................. + // smull2 v8.2D, v16.4S, v2.4S // ....*.......................... + // mul v3.4S, v12.4S, v1.4S // *.............................. + // str q31, [x0, #16] // ......*........................ + // uzp2 v4.4S, v29.4S, v23.4S // *.............................. + // smull v29.2D, v20.2S, v19.2S // .......*....................... + // smull2 v23.2D, v20.4S, v19.4S // .....*......................... + // uzp1 v24.4S, v26.4S, v8.4S // ........*...................... + // smlal v11.2D, v5.2S, v0.2S // ..*............................ + // str q28, [x0], #64 // ..*............................ 
+ // mul v22.4S, v24.4S, v1.4S // ...........*................... + // ldr q30, [x1, #-32] // ...*........................... + // smlal v14.2D, v3.2S, v0.2S // ..........*.................... + // uzp1 v19.4S, v29.4S, v23.4S // ...........*................... + // smlal2 v18.2D, v5.4S, v0.4S // ...*........................... + // str q4, [x0, #-16] // .....*......................... + // smlal2 v17.2D, v3.4S, v0.4S // ......*........................ + // smlal2 v8.2D, v22.4S, v0.4S // ..................*............ + // mul v19.4S, v19.4S, v1.4S // .............*................. + // ldr q6, [x2, #-32] // ....*.......................... + // uzp2 v31.4S, v11.4S, v18.4S // .......*....................... + // uzp2 v21.4S, v14.4S, v17.4S // ..............*................ + // smlal v26.2D, v22.2S, v0.2S // .................*............. + // smull v14.2D, v30.2S, v6.2S // ........*...................... + // str q21, [x0, #-32] // ................*.............. + // smull2 v17.2D, v30.4S, v6.4S // .........*..................... + // smlal v29.2D, v19.2S, v0.2S // ....................*.......... + // smlal2 v23.2D, v19.4S, v0.4S // ...................*........... + // uzp2 v28.4S, v26.4S, v8.4S // ......................*........ + // uzp1 v12.4S, v14.4S, v17.4S // .............*................. + // mul v3.4S, v12.4S, v1.4S // ...............*............... + // str q31, [x0, #16] // .........*..................... + // uzp2 v4.4S, v29.4S, v23.4S // ........................*...... + // str q28, [x0], #64 // ........................*...... + // smlal v14.2D, v3.2S, v0.2S // ......................*........ + // str q4, [x0, #-16] // ..........................*.... + // smlal2 v17.2D, v3.4S, v0.4S // .....................*......... + // uzp2 v21.4S, v14.4S, v17.4S // ..........................*.... + // str q21, [x0, #-32] // ............................*.. - ldr q_aa, [a0_ptr], #64 - ldr q_bb, [b0_ptr], #64 - pmull resl, resh, aa, bb - montgomery_reduce_long res, resl, resh - str q_res, [out_ptr], #64 - - ldr q_aa, [a0_ptr, #-48] - ldr q_bb, [b0_ptr, #-48] - pmull resl, resh, aa, bb - montgomery_reduce_long res, resl, resh - str q_res, [out_ptr, #-48] - - ldr q_aa, [a0_ptr, #-32] - ldr q_bb, [b0_ptr, #-32] - pmull resl, resh, aa, bb - montgomery_reduce_long res, resl, resh - str q_res, [out_ptr, #-32] - - ldr q_aa, [a0_ptr, #-16] - ldr q_bb, [b0_ptr, #-16] - pmull resl, resh, aa, bb - montgomery_reduce_long res, resl, resh - str q_res, [out_ptr, #-16] - - subs count, count, #4 - cbnz count, loop_start pop_stack ret
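
Note: all three kernels compute the same operation as the C reference in
mldsa/poly.c, i.e. a coefficient-wise product followed by a Montgomery
reduction by 2^{-32}. The following minimal C model (illustrative only, not
part of the patches; mont_mul_ref is a made-up name) spells out the
"add m*q, keep the high half" reduction variant used by the AArch64 macro
montgomery_reduce_long; the AVX2 kernel uses the equivalent subtract form
(vpsubq) with q^-1 mod 2^32 in place of -q^-1:

    #include <stdint.h>

    #define Q 8380417            /* ML-DSA modulus q                 */
    #define QINV_NEG 4236238847u /* -q^-1 mod 2^32 (modulus_twisted) */

    static int32_t mont_mul_ref(int32_t a, int32_t b)
    {
      int64_t prod = (int64_t)a * b; /* smull/smull2 */
      /* m = (prod mod 2^32) * (-q^-1) mod 2^32 -- the uzp1 + mul steps */
      int32_t m = (int32_t)((uint32_t)prod * QINV_NEG);
      /* prod + m*q has a zero low word; its high word equals
       * a*b*2^-32 (mod q) -- the smlal/smlal2 + uzp2 steps */
      return (int32_t)((prod + (int64_t)m * Q) >> 32);
    }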