From eeb9204058a0fb53e7a48ed39bc3630961fe351e Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Wed, 2 Jul 2025 11:33:22 +0800 Subject: [PATCH 1/3] AArch64: Native poly_pointwise_montgomery Signed-off-by: Matthias J. Kannwischer --- mldsa/native/aarch64/meta.h | 8 ++ .../native/aarch64/src/arith_native_aarch64.h | 3 + .../native/aarch64/src/pointwise_montgomery.S | 119 ++++++++++++++++++ mldsa/native/api.h | 20 +++ mldsa/poly.c | 7 ++ 5 files changed, 157 insertions(+) create mode 100644 mldsa/native/aarch64/src/pointwise_montgomery.S diff --git a/mldsa/native/aarch64/meta.h b/mldsa/native/aarch64/meta.h index 6b044a93..64c88d05 100644 --- a/mldsa/native/aarch64/meta.h +++ b/mldsa/native/aarch64/meta.h @@ -10,6 +10,7 @@ /* Set of primitives that this backend replaces */ #define MLD_USE_NATIVE_NTT #define MLD_USE_NATIVE_INTT +#define MLD_USE_NATIVE_POINTWISE /* Identifier for this backend so that source and assembly files * in the build can be appropriately guarded. */ @@ -31,6 +32,13 @@ static MLD_INLINE void mld_intt_native(int32_t data[MLDSA_N]) mld_aarch64_intt_zetas_layer123456); } +static MLD_INLINE void mld_pointwise_montgomery_native( + int32_t out[MLDSA_N], const int32_t in0[MLDSA_N], + const int32_t in1[MLDSA_N]) +{ + mld_pointwise_montgomery_asm(out, in0, in1); +} + #endif /* !__ASSEMBLER__ */ #endif /* !MLD_NATIVE_AARCH64_META_H */ diff --git a/mldsa/native/aarch64/src/arith_native_aarch64.h b/mldsa/native/aarch64/src/arith_native_aarch64.h index d3528e6f..3acc873c 100644 --- a/mldsa/native/aarch64/src/arith_native_aarch64.h +++ b/mldsa/native/aarch64/src/arith_native_aarch64.h @@ -32,4 +32,7 @@ void mld_ntt_asm(int32_t *, const int32_t *, const int32_t *); #define mld_intt_asm MLD_NAMESPACE(intt_asm) void mld_intt_asm(int32_t *, const int32_t *, const int32_t *); +#define mld_pointwise_montgomery_asm MLD_NAMESPACE(mld_pointwise_montgomery_asm) +void mld_pointwise_montgomery_asm(int32_t *, const int32_t *, const int32_t *); + #endif /* !MLD_NATIVE_AARCH64_SRC_ARITH_NATIVE_AARCH64_H */ diff --git a/mldsa/native/aarch64/src/pointwise_montgomery.S b/mldsa/native/aarch64/src/pointwise_montgomery.S new file mode 100644 index 00000000..bfa1d7ad --- /dev/null +++ b/mldsa/native/aarch64/src/pointwise_montgomery.S @@ -0,0 +1,119 @@ +/* Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_AARCH64) + +.macro montgomery_reduce_long res, inl, inh + uzp1 t0.4s, \inl\().4s, \inh\().4s + mul t0.4s, t0.4s, modulus_twisted.4s + smlal \inl\().2d, t0.2s, modulus.2s + smlal2 \inh\().2d, t0.4s, modulus.4s + uzp2 \res\().4s, \inl\().4s, \inh\().4s +.endm + + +.macro pmull dl, dh, a, b + smull \dl\().2d, \a\().2s, \b\().2s + smull2 \dh\().2d, \a\().4s, \b\().4s +.endm + +.macro pmlal dl, dh, a, b + smlal \dl\().2d, \a\().2s, \b\().2s + smlal2 \dh\().2d, \a\().4s, \b\().4s +.endm + +.macro save_vregs + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +.macro push_stack + save_vregs +.endm + +.macro pop_stack + restore_vregs +.endm + +out_ptr .req x0 +a0_ptr .req x1 +b0_ptr .req x2 +count .req x3 +wtmp .req w3 + +modulus .req v0 +modulus_twisted .req v1 + +aa .req v2 +bb .req v3 +res .req v4 +resl .req v5 +resh .req v6 +t0 .req 
v7
+
+q_aa .req q2
+q_bb .req q3
+q_res .req q4
+
+.text
+.global MLD_ASM_NAMESPACE(mld_pointwise_montgomery_asm)
+.balign 4
+MLD_ASM_FN_SYMBOL(mld_pointwise_montgomery_asm)
+        push_stack
+
+        // load q = 8380417
+        movz wtmp, #57345
+        movk wtmp, #127, lsl #16
+        dup modulus.4s, wtmp
+
+        // load -q^-1 = 4236238847
+        movz wtmp, #57343
+        movk wtmp, #64639, lsl #16
+        dup modulus_twisted.4s, wtmp
+        mov count, #(MLDSA_N / 4)
+loop_start:
+
+
+        ldr q_aa, [a0_ptr], #64
+        ldr q_bb, [b0_ptr], #64
+        pmull resl, resh, aa, bb
+        montgomery_reduce_long res, resl, resh
+        str q_res, [out_ptr], #64
+
+        ldr q_aa, [a0_ptr, #-48]
+        ldr q_bb, [b0_ptr, #-48]
+        pmull resl, resh, aa, bb
+        montgomery_reduce_long res, resl, resh
+        str q_res, [out_ptr, #-48]
+
+        ldr q_aa, [a0_ptr, #-32]
+        ldr q_bb, [b0_ptr, #-32]
+        pmull resl, resh, aa, bb
+        montgomery_reduce_long res, resl, resh
+        str q_res, [out_ptr, #-32]
+
+        ldr q_aa, [a0_ptr, #-16]
+        ldr q_bb, [b0_ptr, #-16]
+        pmull resl, resh, aa, bb
+        montgomery_reduce_long res, resl, resh
+        str q_res, [out_ptr, #-16]
+
+        subs count, count, #4
+        cbnz count, loop_start
+
+        pop_stack
+        ret
+#endif /* MLD_ARITH_BACKEND_AARCH64 */
diff --git a/mldsa/native/api.h b/mldsa/native/api.h
index eb3196db..7edc87a7 100644
--- a/mldsa/native/api.h
+++ b/mldsa/native/api.h
@@ -99,4 +99,24 @@ static MLD_INLINE void mld_poly_permute_bitrev_to_custom(int32_t p[MLDSA_N])
 static MLD_INLINE void mld_intt_native(int16_t p[MLDSA_N])
 #endif /* MLD_USE_NATIVE_INTT */
 
+#if defined(MLD_USE_NATIVE_POINTWISE)
+/*************************************************
+ * Name:        mld_pointwise_montgomery_native
+ *
+ * Description: Pointwise multiplication of polynomials in NTT domain
+ *              representation and multiplication of resulting polynomial
+ *              by 2^{-32}.
+ *
+ * Arguments:   - int32_t out[MLDSA_N]:       pointer to output polynomial
+ *              - const int32_t in0[MLDSA_N]: pointer to first input
+ *                                            polynomial
+ *              - const int32_t in1[MLDSA_N]: pointer to second input
+ *                                            polynomial
+ **************************************************/
+static MLD_INLINE void mld_pointwise_montgomery_native(
+    int32_t out[MLDSA_N], const int32_t in0[MLDSA_N],
+    const int32_t in1[MLDSA_N]);
+#endif /* MLD_USE_NATIVE_POINTWISE */
+
 #endif /* !MLD_NATIVE_API_H */
diff --git a/mldsa/poly.c b/mldsa/poly.c
index f97750fb..7c9e4c8d 100644
--- a/mldsa/poly.c
+++ b/mldsa/poly.c
@@ -131,6 +131,7 @@ void poly_invntt_tomont(poly *a)
 }
 #endif /* MLD_USE_NATIVE_INTT */
 
+#if !defined(MLD_USE_NATIVE_POINTWISE)
 void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b)
 {
   unsigned int i;
@@ -142,6 +143,12 @@ void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b)
     c->coeffs[i] = montgomery_reduce((int64_t)a->coeffs[i] * b->coeffs[i]);
   }
 }
+#else /* !MLD_USE_NATIVE_POINTWISE */
+void poly_pointwise_montgomery(poly *c, const poly *a, const poly *b)
+{
+  mld_pointwise_montgomery_native(c->coeffs, a->coeffs, b->coeffs);
+}
+#endif /* MLD_USE_NATIVE_POINTWISE */
 
 void poly_power2round(poly *a1, poly *a0, const poly *a)
 {

From 47a3e21b09ac120c797b631e325f80a7695cbe8b Mon Sep 17 00:00:00 2001
From: "Matthias J. Kannwischer"
Date: Wed, 2 Jul 2025 11:55:06 +0800
Subject: [PATCH 2/3] AVX2: Native poly_pointwise_montgomery

Signed-off-by: Matthias J. Kannwischer
---
 mldsa/native/x86_64/meta.h                    |   9 ++
 mldsa/native/x86_64/src/arith_native_x86_64.h |   5 +
 mldsa/native/x86_64/src/pointwise.S           | 124 ++++++++++++++++++
 3 files changed, 138 insertions(+)
 create mode 100644 mldsa/native/x86_64/src/pointwise.S

diff --git a/mldsa/native/x86_64/meta.h b/mldsa/native/x86_64/meta.h
index 373cf12b..87c21ff8 100644
--- a/mldsa/native/x86_64/meta.h
+++ b/mldsa/native/x86_64/meta.h
@@ -14,6 +14,7 @@
 #define MLD_USE_NATIVE_NTT_CUSTOM_ORDER
 #define MLD_USE_NATIVE_NTT
 #define MLD_USE_NATIVE_INTT
+#define MLD_USE_NATIVE_POINTWISE
 
 #if !defined(__ASSEMBLER__)
 #include <immintrin.h>
@@ -34,6 +35,14 @@ static MLD_INLINE void mld_intt_native(int32_t data[MLDSA_N])
   mld_invntt_avx2((__m256i *)data, mld_qdata.vec);
 }
 
+static MLD_INLINE void mld_pointwise_montgomery_native(
+    int32_t out[MLDSA_N], const int32_t in0[MLDSA_N],
+    const int32_t in1[MLDSA_N])
+{
+  mld_pointwise_montgomery_avx2((__m256i *)out, (const __m256i *)in0,
+                                (const __m256i *)in1, mld_qdata.vec);
+}
+
 #endif /* !__ASSEMBLER__ */
 
 #endif /* !MLD_NATIVE_X86_64_META_H */
diff --git a/mldsa/native/x86_64/src/arith_native_x86_64.h b/mldsa/native/x86_64/src/arith_native_x86_64.h
index 602f7485..4058666d 100644
--- a/mldsa/native/x86_64/src/arith_native_x86_64.h
+++ b/mldsa/native/x86_64/src/arith_native_x86_64.h
@@ -19,4 +19,9 @@ void mld_invntt_avx2(__m256i *r, const __m256i *mld_qdata);
 #define mld_nttunpack_avx2 MLD_NAMESPACE(nttunpack_avx2)
 void mld_nttunpack_avx2(__m256i *r);
 
+#define mld_pointwise_montgomery_avx2 \
+  MLD_NAMESPACE(mld_pointwise_montgomery_avx2)
+void mld_pointwise_montgomery_avx2(__m256i *r, const __m256i *a,
+                                   const __m256i *b, const __m256i *mld_qdata);
+
 #endif /* !MLD_NATIVE_X86_64_SRC_ARITH_NATIVE_X86_64_H */
diff --git a/mldsa/native/x86_64/src/pointwise.S b/mldsa/native/x86_64/src/pointwise.S
new file mode 100644
index 00000000..213862cc
--- /dev/null
+++ b/mldsa/native/x86_64/src/pointwise.S
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) The mldsa-native project authors
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
+ */
+
+/*
+ * This file is derived from the public domain
+ * AVX2 Dilithium implementation @[REF_AVX2].
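+ *
+ * For each coefficient pair the kernel computes the Montgomery product
+ * r[i] = a[i] * b[i] * 2^{-32} (mod q). The main loop processes 24
+ * coefficients per iteration (10 iterations); the code after the loop
+ * handles the remaining 16 of the MLDSA_N = 256 coefficients.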
+ */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#include "consts.h" + +.text +.global MLD_ASM_NAMESPACE(mld_pointwise_montgomery_avx2) +MLD_ASM_FN_SYMBOL(mld_pointwise_montgomery_avx2) +#consts +vmovdqa MLD_AVX2_BACKEND_DATA_OFFSET_8XQINV*4(%rcx),%ymm0 +vmovdqa MLD_AVX2_BACKEND_DATA_OFFSET_8XQ*4(%rcx),%ymm1 + +xor %eax,%eax +_looptop1: +#load +vmovdqa (%rsi),%ymm2 +vmovdqa 32(%rsi),%ymm4 +vmovdqa 64(%rsi),%ymm6 +vmovdqa (%rdx),%ymm10 +vmovdqa 32(%rdx),%ymm12 +vmovdqa 64(%rdx),%ymm14 +vpsrlq $32,%ymm2,%ymm3 +vpsrlq $32,%ymm4,%ymm5 +vmovshdup %ymm6,%ymm7 +vpsrlq $32,%ymm10,%ymm11 +vpsrlq $32,%ymm12,%ymm13 +vmovshdup %ymm14,%ymm15 + +#mul +vpmuldq %ymm2,%ymm10,%ymm2 +vpmuldq %ymm3,%ymm11,%ymm3 +vpmuldq %ymm4,%ymm12,%ymm4 +vpmuldq %ymm5,%ymm13,%ymm5 +vpmuldq %ymm6,%ymm14,%ymm6 +vpmuldq %ymm7,%ymm15,%ymm7 + +#reduce +vpmuldq %ymm0,%ymm2,%ymm10 +vpmuldq %ymm0,%ymm3,%ymm11 +vpmuldq %ymm0,%ymm4,%ymm12 +vpmuldq %ymm0,%ymm5,%ymm13 +vpmuldq %ymm0,%ymm6,%ymm14 +vpmuldq %ymm0,%ymm7,%ymm15 +vpmuldq %ymm1,%ymm10,%ymm10 +vpmuldq %ymm1,%ymm11,%ymm11 +vpmuldq %ymm1,%ymm12,%ymm12 +vpmuldq %ymm1,%ymm13,%ymm13 +vpmuldq %ymm1,%ymm14,%ymm14 +vpmuldq %ymm1,%ymm15,%ymm15 +vpsubq %ymm10,%ymm2,%ymm2 +vpsubq %ymm11,%ymm3,%ymm3 +vpsubq %ymm12,%ymm4,%ymm4 +vpsubq %ymm13,%ymm5,%ymm5 +vpsubq %ymm14,%ymm6,%ymm6 +vpsubq %ymm15,%ymm7,%ymm7 +vpsrlq $32,%ymm2,%ymm2 +vpsrlq $32,%ymm4,%ymm4 +vmovshdup %ymm6,%ymm6 + +#store +vpblendd $0xAA,%ymm3,%ymm2,%ymm2 +vpblendd $0xAA,%ymm5,%ymm4,%ymm4 +vpblendd $0xAA,%ymm7,%ymm6,%ymm6 +vmovdqa %ymm2,(%rdi) +vmovdqa %ymm4,32(%rdi) +vmovdqa %ymm6,64(%rdi) + +add $96,%rdi +add $96,%rsi +add $96,%rdx +add $1,%eax +cmp $10,%eax +jb _looptop1 + +vmovdqa (%rsi),%ymm2 +vmovdqa 32(%rsi),%ymm4 +vmovdqa (%rdx),%ymm10 +vmovdqa 32(%rdx),%ymm12 +vpsrlq $32,%ymm2,%ymm3 +vpsrlq $32,%ymm4,%ymm5 +vmovshdup %ymm10,%ymm11 +vmovshdup %ymm12,%ymm13 + +#mul +vpmuldq %ymm2,%ymm10,%ymm2 +vpmuldq %ymm3,%ymm11,%ymm3 +vpmuldq %ymm4,%ymm12,%ymm4 +vpmuldq %ymm5,%ymm13,%ymm5 + +#reduce +vpmuldq %ymm0,%ymm2,%ymm10 +vpmuldq %ymm0,%ymm3,%ymm11 +vpmuldq %ymm0,%ymm4,%ymm12 +vpmuldq %ymm0,%ymm5,%ymm13 +vpmuldq %ymm1,%ymm10,%ymm10 +vpmuldq %ymm1,%ymm11,%ymm11 +vpmuldq %ymm1,%ymm12,%ymm12 +vpmuldq %ymm1,%ymm13,%ymm13 +vpsubq %ymm10,%ymm2,%ymm2 +vpsubq %ymm11,%ymm3,%ymm3 +vpsubq %ymm12,%ymm4,%ymm4 +vpsubq %ymm13,%ymm5,%ymm5 +vpsrlq $32,%ymm2,%ymm2 +vmovshdup %ymm4,%ymm4 + +#store +vpblendd $0x55,%ymm2,%ymm3,%ymm2 +vpblendd $0x55,%ymm4,%ymm5,%ymm4 +vmovdqa %ymm2,(%rdi) +vmovdqa %ymm4,32(%rdi) + +ret + +#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ From 1fcd573f91fa2d317c4c95fbfbf87e477d1d9817 Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Thu, 3 Jul 2025 14:44:54 +0800 Subject: [PATCH 3/3] [TEST]: Apply SLOTHY to AArch64 pointwise_montgomery.S Signed-off-by: Matthias J. 
Kannwischer --- dev/aarch64_clean/src/pointwise_montgomery.S | 119 +++++++ mldsa/native/aarch64/src/Makefile | 42 +++ .../native/aarch64/src/pointwise_montgomery.S | 321 ++++++++++++++++-- 3 files changed, 456 insertions(+), 26 deletions(-) create mode 100644 dev/aarch64_clean/src/pointwise_montgomery.S create mode 100644 mldsa/native/aarch64/src/Makefile diff --git a/dev/aarch64_clean/src/pointwise_montgomery.S b/dev/aarch64_clean/src/pointwise_montgomery.S new file mode 100644 index 00000000..bfa1d7ad --- /dev/null +++ b/dev/aarch64_clean/src/pointwise_montgomery.S @@ -0,0 +1,119 @@ +/* Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +#include "../../../common.h" +#if defined(MLD_ARITH_BACKEND_AARCH64) + +.macro montgomery_reduce_long res, inl, inh + uzp1 t0.4s, \inl\().4s, \inh\().4s + mul t0.4s, t0.4s, modulus_twisted.4s + smlal \inl\().2d, t0.2s, modulus.2s + smlal2 \inh\().2d, t0.4s, modulus.4s + uzp2 \res\().4s, \inl\().4s, \inh\().4s +.endm + + +.macro pmull dl, dh, a, b + smull \dl\().2d, \a\().2s, \b\().2s + smull2 \dh\().2d, \a\().4s, \b\().4s +.endm + +.macro pmlal dl, dh, a, b + smlal \dl\().2d, \a\().2s, \b\().2s + smlal2 \dh\().2d, \a\().4s, \b\().4s +.endm + +.macro save_vregs + sub sp, sp, #(16*4) + stp d8, d9, [sp, #16*0] + stp d10, d11, [sp, #16*1] + stp d12, d13, [sp, #16*2] + stp d14, d15, [sp, #16*3] +.endm + +.macro restore_vregs + ldp d8, d9, [sp, #16*0] + ldp d10, d11, [sp, #16*1] + ldp d12, d13, [sp, #16*2] + ldp d14, d15, [sp, #16*3] + add sp, sp, #(16*4) +.endm + +.macro push_stack + save_vregs +.endm + +.macro pop_stack + restore_vregs +.endm + +out_ptr .req x0 +a0_ptr .req x1 +b0_ptr .req x2 +count .req x3 +wtmp .req w3 + +modulus .req v0 +modulus_twisted .req v1 + +aa .req v2 +bb .req v3 +res .req v4 +resl .req v5 +resh .req v6 +t0 .req v7 + +q_aa .req q2 +q_bb .req q3 +q_res .req q4 + +.text +.global MLD_ASM_NAMESPACE(mld_pointwise_montgomery_asm) +.balign 4 +MLD_ASM_FN_SYMBOL(mld_pointwise_montgomery_asm) + push_stack + + // load q = 8380417 + movz wtmp, #57345 + movk wtmp, #127, lsl #16 + dup modulus.4s, wtmp + + // load -q^-1 = 4236238847 + movz wtmp, #57343 + movk wtmp, #64639, lsl #16 + dup modulus_twisted.4s, wtmp + mov count, #(MLDSA_N / 4) +loop_start: + + + ldr q_aa, [a0_ptr], #64 + ldr q_bb, [b0_ptr], #64 + pmull resl, resh, aa, bb + montgomery_reduce_long res, resl, resh + str q_res, [out_ptr], #64 + + ldr q_aa, [a0_ptr, #-48] + ldr q_bb, [b0_ptr, #-48] + pmull resl, resh, aa, bb + montgomery_reduce_long res, resl, resh + str q_res, [out_ptr, #-48] + + ldr q_aa, [a0_ptr, #-32] + ldr q_bb, [b0_ptr, #-32] + pmull resl, resh, aa, bb + montgomery_reduce_long res, resl, resh + str q_res, [out_ptr, #-32] + + ldr q_aa, [a0_ptr, #-16] + ldr q_bb, [b0_ptr, #-16] + pmull resl, resh, aa, bb + montgomery_reduce_long res, resl, resh + str q_res, [out_ptr, #-16] + + subs count, count, #4 + cbnz count, loop_start + + pop_stack + ret +#endif /* MLD_ARITH_BACKEND_AARCH64 */ diff --git a/mldsa/native/aarch64/src/Makefile b/mldsa/native/aarch64/src/Makefile new file mode 100644 index 00000000..db8f5554 --- /dev/null +++ b/mldsa/native/aarch64/src/Makefile @@ -0,0 +1,42 @@ +# Copyright (c) The mldsa-native project authors +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + +###### +# To run, see the README.md file +###### +.PHONY: all clean + +# ISA to optimize for +TARGET_ISA=Arm_AArch64 + +# MicroArch target to optimize for +TARGET_MICROARCH=Arm_Neoverse_N1_experimental + +SLOTHY_EXTRA_FLAGS ?= + 
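+# Extra SLOTHY options can be passed on the command line, e.g.
+#   make SLOTHY_EXTRA_FLAGS="-c constraints.stalls_first_attempt=32"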
+SLOTHY_FLAGS=-c sw_pipelining.enabled=true \ + -c inputs_are_outputs \ + -c sw_pipelining.minimize_overlapping=False \ + -c sw_pipelining.allow_post \ + -c variable_size \ + -c constraints.stalls_first_attempt=64 \ + $(SLOTHY_EXTRA_FLAGS) + +# For kernels which stash callee-saved v8-v15 but don't stash callee-saved GPRs x19-x30. +# Allow SLOTHY to use all V-registers, but only caller-saved GPRs. +RESERVE_X_ONLY_FLAG=-c reserved_regs="[x18--x30,sp]" + +# Used for kernels which don't stash callee-saved registers. +# Restrict SLOTHY to caller-saved registers. +RESERVE_ALL_FLAG=-c reserved_regs="[x18--x30,sp,v8--v15]" + +all: pointwise_montgomery.S + +# These units explicitly save and restore registers v8-v15, so SLOTHY can freely use +# those registers. +pointwise_montgomery.S: ../../../../dev/aarch64_clean/src/pointwise_montgomery.S + slothy-cli $(TARGET_ISA) $(TARGET_MICROARCH) $< -o $@ -l loop_start $(SLOTHY_FLAGS) $(RESERVE_X_ONLY_FLAG) + + +clean: + -$(RM) -rf pointwise_montgomery.S diff --git a/mldsa/native/aarch64/src/pointwise_montgomery.S b/mldsa/native/aarch64/src/pointwise_montgomery.S index bfa1d7ad..f613f33a 100644 --- a/mldsa/native/aarch64/src/pointwise_montgomery.S +++ b/mldsa/native/aarch64/src/pointwise_montgomery.S @@ -84,35 +84,304 @@ MLD_ASM_FN_SYMBOL(mld_pointwise_montgomery_asm) movk wtmp, #64639, lsl #16 dup modulus_twisted.4s, wtmp mov count, #(MLDSA_N / 4) + // Instructions: 40 + // Expected cycles: 28 + // Expected IPC: 1.43 + // + // Cycle bound: 28.0 + // IPC bound: 1.43 + // + // Wall time: 0.19s + // User time: 0.19s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q19, [x1], #64 // *............................. + ldr q14, [x2], #64 // *............................. + ldr q12, [x1, #-48] // .*............................ + ldr q6, [x2, #-48] // .*............................ + ldr q29, [x2, #-16] // ..*........................... + ldr q25, [x1, #-16] // ..*........................... + ldr q4, [x2, #16] // ...*.......................... + ldr q31, [x1, #16] // ...*.......................... + ldr q17, [x1, #-32] // ....*......................... + smull v5.2D, v19.2S, v14.2S // ....*......................... + smull2 v28.2D, v19.4S, v14.4S // .....*........................ + ldr q20, [x2, #-32] // .....*........................ + smull v22.2D, v12.2S, v6.2S // ......*....................... + ldr q16, [x1], #64 // ......*....................... + smull2 v8.2D, v12.4S, v6.4S // .......*...................... + ldr q2, [x2], #64 // .......*...................... + smull2 v23.2D, v25.4S, v29.4S // ........*..................... + smull v29.2D, v25.2S, v29.2S // .........*.................... + uzp1 v10.4S, v5.4S, v28.4S // .........*.................... + smull v11.2D, v31.2S, v4.2S // ..........*................... + mul v30.4S, v10.4S, v1.4S // ...........*.................. + uzp1 v7.4S, v22.4S, v8.4S // ...........*.................. + smull2 v18.2D, v31.4S, v4.4S // .............*................ + uzp1 v26.4S, v29.4S, v23.4S // .............*................ + mul v19.4S, v7.4S, v1.4S // ..............*............... + smull v14.2D, v17.2S, v20.2S // ................*............. + smull2 v17.2D, v17.4S, v20.4S // .................*............ + uzp1 v15.4S, v11.4S, v18.4S // .................*............ + smlal v5.2D, v30.2S, v0.2S // ..................*........... + mul v4.4S, v26.4S, v1.4S // ...................*.......... + smlal2 v28.2D, v30.4S, v0.4S // .....................*........ 
+ uzp1 v12.4S, v14.4S, v17.4S // .....................*........ + smlal v22.2D, v19.2S, v0.2S // ......................*....... + smlal2 v8.2D, v19.4S, v0.4S // .......................*...... + smlal2 v23.2D, v4.4S, v0.4S // ........................*..... + smlal v29.2D, v4.2S, v0.2S // .........................*.... + uzp2 v28.4S, v5.4S, v28.4S // .........................*.... + smull v26.2D, v16.2S, v2.2S // ..........................*... + mul v5.4S, v15.4S, v1.4S // ...........................*.. + uzp2 v31.4S, v22.4S, v8.4S // ...........................*.. + + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q2, [x2], #64 // *.............................. + // ldr q16, [x1], #64 // *.............................. + // ldr q25, [x1, #-48] // .*............................. + // ldr q15, [x2, #-48] // .*............................. + // smull v11.2D, v25.2S, v15.2S // ......*........................ + // smull2 v18.2D, v25.4S, v15.4S // .......*....................... + // uzp1 v9.4S, v11.4S, v18.4S // ...........*................... + // smull v26.2D, v16.2S, v2.2S // ....*.......................... + // mul v5.4S, v9.4S, v1.4S // ..............*................ + // ldr q20, [x1, #-16] // ..*............................ + // ldr q19, [x2, #-16] // ..*............................ + // smull2 v8.2D, v16.4S, v2.4S // .....*......................... + // ldr q2, [x2], #64 // .......*....................... + // ldr q16, [x1], #64 // ......*........................ + // ldr q25, [x1, #-48] // ...*........................... + // smull v29.2D, v20.2S, v19.2S // .........*..................... + // smull2 v23.2D, v20.4S, v19.4S // ........*...................... + // uzp1 v24.4S, v26.4S, v8.4S // .........*..................... + // smlal v11.2D, v5.2S, v0.2S // ......................*........ + // mul v22.4S, v24.4S, v1.4S // ...........*................... + // ldr q15, [x2, #-48] // ...*........................... + // ldr q30, [x1, #-96] // ....*.......................... + // uzp1 v19.4S, v29.4S, v23.4S // .............*................. + // smlal2 v18.2D, v5.4S, v0.4S // .......................*....... + // smlal2 v8.2D, v22.4S, v0.4S // .....................*......... + // mul v19.4S, v19.4S, v1.4S // ...................*........... + // ldr q6, [x2, #-96] // .....*......................... + // uzp2 v31.4S, v11.4S, v18.4S // ...........................*... + // smull v11.2D, v25.2S, v15.2S // ..........*.................... + // smull2 v18.2D, v25.4S, v15.4S // .............*................. + // smlal v26.2D, v22.2S, v0.2S // ..................*............ + // smull v14.2D, v30.2S, v6.2S // ................*.............. + // smull2 v17.2D, v30.4S, v6.4S // .................*............. + // smlal v29.2D, v19.2S, v0.2S // .........................*..... + // uzp1 v9.4S, v11.4S, v18.4S // .................*............. + // smlal2 v23.2D, v19.4S, v0.4S // ........................*...... + // uzp2 v28.4S, v26.4S, v8.4S // .........................*..... + // smull v26.2D, v16.2S, v2.2S // ..........................*.... + // mul v5.4S, v9.4S, v1.4S // ...........................*... + // uzp1 v12.4S, v14.4S, v17.4S // .....................*......... 
+ + sub count, count, #8 loop_start: + // Instructions: 40 + // Expected cycles: 24 + // Expected IPC: 1.67 + // + // Cycle bound: 24.0 + // IPC bound: 1.67 + // + // Wall time: 0.53s + // User time: 0.53s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + ldr q20, [x1, #-16] // *............................. + ldr q19, [x2, #-16] // *............................. + smull2 v8.2D, v16.4S, v2.4S // .*............................ + ldr q2, [x2], #64 // .e............................ + mul v3.4S, v12.4S, v1.4S // ..l........................... + ldr q16, [x1], #64 // ..e........................... + ldr q25, [x1, #-48] // ...e.......................... + str q31, [x0, #16] // ...l.......................... + uzp2 v4.4S, v29.4S, v23.4S // ....l......................... + smull v29.2D, v20.2S, v19.2S // ....*......................... + smull2 v23.2D, v20.4S, v19.4S // .....*........................ + uzp1 v24.4S, v26.4S, v8.4S // .....*........................ + smlal v11.2D, v5.2S, v0.2S // ......*....................... + str q28, [x0], #64 // ......l....................... + mul v22.4S, v24.4S, v1.4S // .......*...................... + ldr q15, [x2, #-48] // .......e...................... + ldr q30, [x1, #-96] // ........*..................... + smlal v14.2D, v3.2S, v0.2S // .........l.................... + uzp1 v19.4S, v29.4S, v23.4S // .........*.................... + smlal2 v18.2D, v5.4S, v0.4S // ..........*................... + str q4, [x0, #-16] // ..........l................... + smlal2 v17.2D, v3.4S, v0.4S // ...........l.................. + smlal2 v8.2D, v22.4S, v0.4S // ............*................. + mul v19.4S, v19.4S, v1.4S // .............*................ + ldr q6, [x2, #-96] // ..............*............... + uzp2 v31.4S, v11.4S, v18.4S // ...............*.............. + smull v11.2D, v25.2S, v15.2S // ...............e.............. + smull2 v18.2D, v25.4S, v15.4S // ................e............. + uzp2 v21.4S, v14.4S, v17.4S // ................l............. + smlal v26.2D, v22.2S, v0.2S // .................*............ + smull v14.2D, v30.2S, v6.2S // ..................*........... + str q21, [x0, #-32] // ..................l........... + smull2 v17.2D, v30.4S, v6.4S // ...................*.......... + smlal v29.2D, v19.2S, v0.2S // ....................*......... + uzp1 v9.4S, v11.4S, v18.4S // ....................e......... + smlal2 v23.2D, v19.4S, v0.4S // .....................*........ + uzp2 v28.4S, v26.4S, v8.4S // .....................*........ + smull v26.2D, v16.2S, v2.2S // ......................e....... + mul v5.4S, v9.4S, v1.4S // .......................e...... + uzp1 v12.4S, v14.4S, v17.4S // .......................*...... + + // ----------------------- cycle (expected) ------------------------> + // 0 25 50 + // |------------------------|------------------------|--------------- + // ldr q2, [x1], #64 // .e.....................'.~.....................'.~................ + // ldr q3, [x2], #64 // e......................'~......................'~................. + // smull v5.2d, v2.2s, v3.2s // .....................e.'.....................~.'.................. + // smull2 v6.2d, v2.4s, v3.4s // ~......................'*......................'~................. + // uzp1 v7.4s, v5.4s, v6.4s // ....~..................'....*..................'....~............. + // mul v7.4s, v7.4s, v1.4s // ......~................'......*................'......~........... 
+ // smlal v5.2d, v7.2s, v0.2s // ................~......'................*......'................~. + // smlal2 v6.2d, v7.4s, v0.4s // ...........~...........'...........*...........'...........~...... + // uzp2 v4.4s, v5.4s, v6.4s // ....................~..'....................*..'.................. + // str q4, [x0], #64 // .....~.................'.....~.................'.....l............ + // ldr q2, [x1, #-48] // ..e....................'..~....................'..~............... + // ldr q3, [x2, #-48] // ......e................'......~................'......~........... + // smull v5.2d, v2.2s, v3.2s // ..............e........'..............~........'..............~... + // smull2 v6.2d, v2.4s, v3.4s // ...............e.......'...............~.......'...............~.. + // uzp1 v7.4s, v5.4s, v6.4s // ...................e...'...................~...'.................. + // mul v7.4s, v7.4s, v1.4s // ......................e'......................~'.................. + // smlal v5.2d, v7.2s, v0.2s // .....~.................'.....*.................'.....~............ + // smlal2 v6.2d, v7.4s, v0.4s // .........~.............'.........*.............'.........~........ + // uzp2 v4.4s, v5.4s, v6.4s // ..............~........'..............*........'..............~... + // str q4, [x0, #-48] // ..~....................'..~....................'..l............... + // ldr q2, [x1, #-32] // .......~...............'.......*...............'.......~.......... + // ldr q3, [x2, #-32] // .............~.........'.............*.........'.............~.... + // smull v5.2d, v2.2s, v3.2s // .................~.....'.................*.....'.................. + // smull2 v6.2d, v2.4s, v3.4s // ..................~....'..................*....'.................. + // uzp1 v7.4s, v5.4s, v6.4s // ......................~'......................*'.................. + // mul v7.4s, v7.4s, v1.4s // .~.....................'.~.....................'.l................ + // smlal v5.2d, v7.2s, v0.2s // ........~..............'........~..............'........l......... + // smlal2 v6.2d, v7.4s, v0.4s // ..........~............'..........~............'..........l....... + // uzp2 v4.4s, v5.4s, v6.4s // ...............~.......'...............~.......'...............l.. + // str q4, [x0, #-32] // .................~.....'.................~.....'.................l + // ldr q2, [x1, #-16] // .......................*.......................~.................. + // ldr q3, [x2, #-16] // .......................*.......................~.................. + // smull v5.2d, v2.2s, v3.2s // ...~...................'...*...................'...~.............. + // smull2 v6.2d, v2.4s, v3.4s // ....~..................'....*..................'....~............. + // uzp1 v7.4s, v5.4s, v6.4s // ........~..............'........*..............'........~......... + // mul v7.4s, v7.4s, v1.4s // ............~..........'............*..........'............~..... + // smlal v5.2d, v7.2s, v0.2s // ...................~...'...................*...'.................. + // smlal2 v6.2d, v7.4s, v0.4s // ....................~..'....................*..'.................. + // uzp2 v4.4s, v5.4s, v6.4s // ...~...................'...~...................'...l.............. + // str q4, [x0, #-16] // .........~.............'.........~.............'.........l........ 
+ + subs count, count, 4 + cbnz count, loop_start + // Instructions: 40 + // Expected cycles: 29 + // Expected IPC: 1.38 + // + // Cycle bound: 29.0 + // IPC bound: 1.38 + // + // Wall time: 0.21s + // User time: 0.21s + // + // ----- cycle (expected) ------> + // 0 25 + // |------------------------|---- + uzp2 v22.4S, v29.4S, v23.4S // *............................. + mul v19.4S, v12.4S, v1.4S // *............................. + ldr q27, [x1, #-16] // .*............................ + ldr q13, [x2, #-16] // .*............................ + str q28, [x0], #64 // ..*........................... + smlal v11.2D, v5.2S, v0.2S // ..*........................... + smlal2 v18.2D, v5.4S, v0.4S // ...*.......................... + ldr q29, [x1, #-32] // ...*.......................... + smull2 v16.2D, v16.4S, v2.4S // ....*......................... + ldr q8, [x2, #-32] // ....*......................... + str q22, [x0, #-16] // .....*........................ + smull2 v5.2D, v27.4S, v13.4S // .....*........................ + smlal2 v17.2D, v19.4S, v0.4S // ......*....................... + str q31, [x0, #-48] // ......*....................... + uzp2 v3.4S, v11.4S, v18.4S // .......*...................... + smull v11.2D, v27.2S, v13.2S // .......*...................... + uzp1 v24.4S, v26.4S, v16.4S // ........*..................... + smull v22.2D, v29.2S, v8.2S // ........*..................... + smull2 v12.2D, v29.4S, v8.4S // .........*.................... + str q3, [x0, #16] // .........*.................... + smlal v14.2D, v19.2S, v0.2S // ..........*................... + mul v28.4S, v24.4S, v1.4S // ...........*.................. + uzp1 v4.4S, v11.4S, v5.4S // ...........*.................. + mul v31.4S, v4.4S, v1.4S // .............*................ + uzp1 v30.4S, v22.4S, v12.4S // .............*................ + uzp2 v21.4S, v14.4S, v17.4S // ..............*............... + mul v6.4S, v30.4S, v1.4S // ...............*.............. + str q21, [x0, #-32] // ................*............. + smlal v26.2D, v28.2S, v0.2S // .................*............ + smlal2 v16.2D, v28.4S, v0.4S // ..................*........... + smlal2 v5.2D, v31.4S, v0.4S // ...................*.......... + smlal v11.2D, v31.2S, v0.2S // ....................*......... + smlal2 v12.2D, v6.4S, v0.4S // .....................*........ + smlal v22.2D, v6.2S, v0.2S // ......................*....... + uzp2 v10.4S, v26.4S, v16.4S // ......................*....... + uzp2 v13.4S, v11.4S, v5.4S // ........................*..... + str q10, [x0], #64 // ........................*..... + str q13, [x0, #-16] // ..........................*... + uzp2 v13.4S, v22.4S, v12.4S // ..........................*... + str q13, [x0, #-32] // ............................*. + // ------ cycle (expected) ------> + // 0 25 + // |------------------------|----- + // ldr q20, [x1, #-16] // .*............................. + // ldr q19, [x2, #-16] // .*............................. + // smull2 v8.2D, v16.4S, v2.4S // ....*.......................... + // mul v3.4S, v12.4S, v1.4S // *.............................. + // str q31, [x0, #16] // ......*........................ + // uzp2 v4.4S, v29.4S, v23.4S // *.............................. + // smull v29.2D, v20.2S, v19.2S // .......*....................... + // smull2 v23.2D, v20.4S, v19.4S // .....*......................... + // uzp1 v24.4S, v26.4S, v8.4S // ........*...................... + // smlal v11.2D, v5.2S, v0.2S // ..*............................ + // str q28, [x0], #64 // ..*............................ 
+ // mul v22.4S, v24.4S, v1.4S // ...........*................... + // ldr q30, [x1, #-32] // ...*........................... + // smlal v14.2D, v3.2S, v0.2S // ..........*.................... + // uzp1 v19.4S, v29.4S, v23.4S // ...........*................... + // smlal2 v18.2D, v5.4S, v0.4S // ...*........................... + // str q4, [x0, #-16] // .....*......................... + // smlal2 v17.2D, v3.4S, v0.4S // ......*........................ + // smlal2 v8.2D, v22.4S, v0.4S // ..................*............ + // mul v19.4S, v19.4S, v1.4S // .............*................. + // ldr q6, [x2, #-32] // ....*.......................... + // uzp2 v31.4S, v11.4S, v18.4S // .......*....................... + // uzp2 v21.4S, v14.4S, v17.4S // ..............*................ + // smlal v26.2D, v22.2S, v0.2S // .................*............. + // smull v14.2D, v30.2S, v6.2S // ........*...................... + // str q21, [x0, #-32] // ................*.............. + // smull2 v17.2D, v30.4S, v6.4S // .........*..................... + // smlal v29.2D, v19.2S, v0.2S // ....................*.......... + // smlal2 v23.2D, v19.4S, v0.4S // ...................*........... + // uzp2 v28.4S, v26.4S, v8.4S // ......................*........ + // uzp1 v12.4S, v14.4S, v17.4S // .............*................. + // mul v3.4S, v12.4S, v1.4S // ...............*............... + // str q31, [x0, #16] // .........*..................... + // uzp2 v4.4S, v29.4S, v23.4S // ........................*...... + // str q28, [x0], #64 // ........................*...... + // smlal v14.2D, v3.2S, v0.2S // ......................*........ + // str q4, [x0, #-16] // ..........................*.... + // smlal2 v17.2D, v3.4S, v0.4S // .....................*......... + // uzp2 v21.4S, v14.4S, v17.4S // ..........................*.... + // str q21, [x0, #-32] // ............................*.. - ldr q_aa, [a0_ptr], #64 - ldr q_bb, [b0_ptr], #64 - pmull resl, resh, aa, bb - montgomery_reduce_long res, resl, resh - str q_res, [out_ptr], #64 - - ldr q_aa, [a0_ptr, #-48] - ldr q_bb, [b0_ptr, #-48] - pmull resl, resh, aa, bb - montgomery_reduce_long res, resl, resh - str q_res, [out_ptr, #-48] - - ldr q_aa, [a0_ptr, #-32] - ldr q_bb, [b0_ptr, #-32] - pmull resl, resh, aa, bb - montgomery_reduce_long res, resl, resh - str q_res, [out_ptr, #-32] - - ldr q_aa, [a0_ptr, #-16] - ldr q_bb, [b0_ptr, #-16] - pmull resl, resh, aa, bb - montgomery_reduce_long res, resl, resh - str q_res, [out_ptr, #-16] - - subs count, count, #4 - cbnz count, loop_start pop_stack ret
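
Note: all three kernels compute the same operation as the C reference in
mldsa/poly.c, i.e. a coefficient-wise product followed by a Montgomery
reduction by 2^{-32}. The following minimal C model (illustrative only, not
part of the patches; mont_mul_ref is a made-up name) spells out the
"add m*q, keep the high half" reduction variant used by the AArch64 macro
montgomery_reduce_long; the AVX2 kernel uses the equivalent subtract form
(vpsubq) with q^-1 mod 2^32 in place of -q^-1:

    #include <stdint.h>

    #define Q 8380417            /* ML-DSA modulus q                 */
    #define QINV_NEG 4236238847u /* -q^-1 mod 2^32 (modulus_twisted) */

    static int32_t mont_mul_ref(int32_t a, int32_t b)
    {
      int64_t prod = (int64_t)a * b; /* smull/smull2 */
      /* m = (prod mod 2^32) * (-q^-1) mod 2^32 -- the uzp1 + mul steps */
      int32_t m = (int32_t)((uint32_t)prod * QINV_NEG);
      /* prod + m*q has a zero low word; its high word equals
       * a*b*2^-32 (mod q) -- the smlal/smlal2 + uzp2 steps */
      return (int32_t)((prod + (int64_t)m * Q) >> 32);
    }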