From 95b570d569f9a24db2544512f2eeb0a96c8a6153 Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Mon, 4 Aug 2025 14:38:17 +0800 Subject: [PATCH 1/3] AVX2: Add native implementation of poly_decompose This adds the AVX2 intrinsics implementation of poly_decompose from https://github.com/pq-crystals/dilithium/blob/master/avx2/rounding.c. Resolves https://github.com/pq-code-package/mldsa-native/issues/399. Signed-off-by: Matthias J. Kannwischer --- mldsa/native/api.h | 43 ++++++++++++ mldsa/native/x86_64/meta.h | 14 ++++ mldsa/native/x86_64/src/arith_native_x86_64.h | 6 ++ .../x86_64/src/poly_decompose_32_avx2.c | 66 ++++++++++++++++++ .../x86_64/src/poly_decompose_88_avx2.c | 68 +++++++++++++++++++ mldsa/poly.c | 14 +++- 6 files changed, 210 insertions(+), 1 deletion(-) create mode 100644 mldsa/native/x86_64/src/poly_decompose_32_avx2.c create mode 100644 mldsa/native/x86_64/src/poly_decompose_88_avx2.c diff --git a/mldsa/native/api.h b/mldsa/native/api.h index 688f62a1..3afa66b6 100644 --- a/mldsa/native/api.h +++ b/mldsa/native/api.h @@ -168,4 +168,47 @@ static MLD_INLINE int mld_rej_uniform_eta4_native(int32_t *r, unsigned len, unsigned buflen); #endif /* MLD_USE_NATIVE_REJ_UNIFORM_ETA4 */ +#if defined(MLD_USE_NATIVE_POLY_DECOMPOSE_32) +/************************************************* + * Name: mld_poly_decompose_32_native + * + * Description: Native implementation of poly_decompose for GAMMA2 = (Q-1)/32. + * For all coefficients c of the input polynomial, + * compute high and low bits c0, c1 such + * c mod MLDSA_Q = c1*(2*GAMMA2) + c0 + * with -(2*GAMMA2)/2 < c0 <= (2*GAMMA2)/2 except + * c1 = (MLDSA_Q-1)/(2*GAMMA2) where we set + * c1 = 0 and -(2*GAMMA2)/2 <= c0 = c mod MLDSA_Q - MLDSA_Q < 0. + * Assumes coefficients to be standard representatives. + * + * Arguments: - int32_t *a1: output polynomial with coefficients c1 + * - int32_t *a0: output polynomial with coefficients c0 + * - const int32_t *a: input polynomial + **************************************************/ +static MLD_INLINE void mld_poly_decompose_32_native(int32_t *a1, int32_t *a0, + const int32_t *a); +#endif /* MLD_USE_NATIVE_POLY_DECOMPOSE_32 */ + +#if defined(MLD_USE_NATIVE_POLY_DECOMPOSE_88) +/************************************************* + * Name: mld_poly_decompose_88_native + * + * Description: Native implementation of poly_decompose for GAMMA2 = (Q-1)/88. + * For all coefficients c of the input polynomial, + * compute high and low bits c0, c1 such + * c mod MLDSA_Q = c1*(2*GAMMA2) + c0 + * with -(2*GAMMA2)/2 < c0 <= (2*GAMMA2)/2 except + * c1 = (MLDSA_Q-1)/(2*GAMMA2) where we set + * c1 = 0 and -(2*GAMMA2)/2 <= c0 = c mod MLDSA_Q - MLDSA_Q < 0. + * Assumes coefficients to be standard representatives. + * + * Arguments: - int32_t *a1: output polynomial with coefficients c1 + * - int32_t *a0: output polynomial with coefficients c0 + * - const int32_t *a: input polynomial + **************************************************/ +static MLD_INLINE void mld_poly_decompose_88_native(int32_t *a1, int32_t *a0, + const int32_t *a); +#endif /* MLD_USE_NATIVE_POLY_DECOMPOSE_88 */ + + #endif /* !MLD_NATIVE_API_H */ diff --git a/mldsa/native/x86_64/meta.h b/mldsa/native/x86_64/meta.h index 8bc21317..b2630e9b 100644 --- a/mldsa/native/x86_64/meta.h +++ b/mldsa/native/x86_64/meta.h @@ -17,6 +17,8 @@ #define MLD_USE_NATIVE_REJ_UNIFORM #define MLD_USE_NATIVE_REJ_UNIFORM_ETA2 #define MLD_USE_NATIVE_REJ_UNIFORM_ETA4 +#define MLD_USE_NATIVE_POLY_DECOMPOSE_32 +#define MLD_USE_NATIVE_POLY_DECOMPOSE_88 #if !defined(__ASSEMBLER__) #include @@ -98,6 +100,18 @@ static MLD_INLINE int mld_rej_uniform_eta4_native(int32_t *r, unsigned len, return outlen; } +static MLD_INLINE void mld_poly_decompose_32_native(int32_t *a1, int32_t *a0, + const int32_t *a) +{ + mld_poly_decompose_32_avx2((__m256i *)a1, (__m256i *)a0, (const __m256i *)a); +} + +static MLD_INLINE void mld_poly_decompose_88_native(int32_t *a1, int32_t *a0, + const int32_t *a) +{ + mld_poly_decompose_88_avx2((__m256i *)a1, (__m256i *)a0, (const __m256i *)a); +} + #endif /* !__ASSEMBLER__ */ #endif /* !MLD_NATIVE_X86_64_META_H */ diff --git a/mldsa/native/x86_64/src/arith_native_x86_64.h b/mldsa/native/x86_64/src/arith_native_x86_64.h index fd3369bb..e785734c 100644 --- a/mldsa/native/x86_64/src/arith_native_x86_64.h +++ b/mldsa/native/x86_64/src/arith_native_x86_64.h @@ -54,4 +54,10 @@ unsigned mld_rej_uniform_eta2_avx2( unsigned mld_rej_uniform_eta4_avx2( int32_t *r, const uint8_t buf[MLD_AVX2_REJ_UNIFORM_ETA4_BUFLEN]); +#define mld_poly_decompose_32_avx2 MLD_NAMESPACE(mld_poly_decompose_32_avx2) +void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a); + +#define mld_poly_decompose_88_avx2 MLD_NAMESPACE(mld_poly_decompose_88_avx2) +void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a); + #endif /* !MLD_NATIVE_X86_64_SRC_ARITH_NATIVE_X86_64_H */ diff --git a/mldsa/native/x86_64/src/poly_decompose_32_avx2.c b/mldsa/native/x86_64/src/poly_decompose_32_avx2.c new file mode 100644 index 00000000..f870b408 --- /dev/null +++ b/mldsa/native/x86_64/src/poly_decompose_32_avx2.c @@ -0,0 +1,66 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#include +#include +#include +#include "arith_native_x86_64.h" +#include "consts.h" + +#define _mm256_blendv_epi32(a, b, mask) \ + _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \ + _mm256_castsi256_ps(b), \ + _mm256_castsi256_ps(mask))) + +void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a) +{ + unsigned int i; + __m256i f, f0, f1; + const __m256i q = + _mm256_load_si256(&mld_qdata.vec[MLD_AVX2_BACKEND_DATA_OFFSET_8XQ / 8]); + const __m256i hq = _mm256_srli_epi32(q, 1); + const __m256i v = _mm256_set1_epi32(1025); + const __m256i alpha = _mm256_set1_epi32(2 * MLDSA_GAMMA2); + const __m256i off = _mm256_set1_epi32(127); + const __m256i shift = _mm256_set1_epi32(512); + const __m256i mask = _mm256_set1_epi32(15); + + for (i = 0; i < MLDSA_N / 8; i++) + { + f = _mm256_load_si256(&a[i]); + f1 = _mm256_add_epi32(f, off); + f1 = _mm256_srli_epi32(f1, 7); + f1 = _mm256_mulhi_epu16(f1, v); + f1 = _mm256_mulhrs_epi16(f1, shift); + f1 = _mm256_and_si256(f1, mask); + f0 = _mm256_mullo_epi32(f1, alpha); + f0 = _mm256_sub_epi32(f, f0); + f = _mm256_cmpgt_epi32(f0, hq); + f = _mm256_and_si256(f, q); + f0 = _mm256_sub_epi32(f0, f); + _mm256_store_si256(&a1[i], f1); + _mm256_store_si256(&a0[i], f0); + } +} + +#undef _mm256_blendv_epi32 + +#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + */ + +MLD_EMPTY_CU(avx2_poly_decompose) + +#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \ + !MLD_CONFIG_MULTILEVEL_NO_SHARED) */ diff --git a/mldsa/native/x86_64/src/poly_decompose_88_avx2.c b/mldsa/native/x86_64/src/poly_decompose_88_avx2.c new file mode 100644 index 00000000..e83a6fd7 --- /dev/null +++ b/mldsa/native/x86_64/src/poly_decompose_88_avx2.c @@ -0,0 +1,68 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ + +/* + * This file is derived from the public domain + * AVX2 Dilithium implementation @[REF_AVX2]. + */ + +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \ + !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +#include +#include +#include +#include "arith_native_x86_64.h" +#include "consts.h" + + +#define _mm256_blendv_epi32(a, b, mask) \ + _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a), \ + _mm256_castsi256_ps(b), \ + _mm256_castsi256_ps(mask))) + +void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a) +{ + unsigned int i; + __m256i f, f0, f1, t; + const __m256i q = + _mm256_load_si256(&mld_qdata.vec[MLD_AVX2_BACKEND_DATA_OFFSET_8XQ / 8]); + const __m256i hq = _mm256_srli_epi32(q, 1); + const __m256i v = _mm256_set1_epi32(11275); + const __m256i alpha = _mm256_set1_epi32(2 * MLDSA_GAMMA2); + const __m256i off = _mm256_set1_epi32(127); + const __m256i shift = _mm256_set1_epi32(128); + const __m256i max = _mm256_set1_epi32(43); + const __m256i zero = _mm256_setzero_si256(); + + for (i = 0; i < MLDSA_N / 8; i++) + { + f = _mm256_load_si256(&a[i]); + f1 = _mm256_add_epi32(f, off); + f1 = _mm256_srli_epi32(f1, 7); + f1 = _mm256_mulhi_epu16(f1, v); + f1 = _mm256_mulhrs_epi16(f1, shift); + t = _mm256_sub_epi32(max, f1); + f1 = _mm256_blendv_epi32(f1, zero, t); + f0 = _mm256_mullo_epi32(f1, alpha); + f0 = _mm256_sub_epi32(f, f0); + f = _mm256_cmpgt_epi32(f0, hq); + f = _mm256_and_si256(f, q); + f0 = _mm256_sub_epi32(f0, f); + _mm256_store_si256(&a1[i], f1); + _mm256_store_si256(&a0[i], f0); + } +} +#undef _mm256_blendv_epi32 + +#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \ + */ + +MLD_EMPTY_CU(avx2_poly_decompose) + +#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \ + !MLD_CONFIG_MULTILEVEL_NO_SHARED) */ diff --git a/mldsa/poly.c b/mldsa/poly.c index 90092a8f..49e4c168 100644 --- a/mldsa/poly.c +++ b/mldsa/poly.c @@ -174,9 +174,19 @@ void mld_poly_power2round(mld_poly *a1, mld_poly *a0, const mld_poly *a) void mld_poly_decompose(mld_poly *a1, mld_poly *a0, const mld_poly *a) { +#if defined(MLD_USE_NATIVE_POLY_DECOMPOSE_88) && MLDSA_MODE == 2 + /* TODO: proof */ + mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q); + mld_poly_decompose_88_native(a1->coeffs, a0->coeffs, a->coeffs); +#elif defined(MLD_USE_NATIVE_POLY_DECOMPOSE_32) && \ + (MLDSA_MODE == 3 || MLDSA_MODE == 5) + /* TODO: proof */ + mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q); + mld_poly_decompose_32_native(a1->coeffs, a0->coeffs, a->coeffs); +#else /* !None && MLD_USE_NATIVE_POLY_DECOMPOSE_32 && (MLDSA_MODE == 3 || \ + MLDSA_MODE == 5) */ unsigned int i; mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q); - for (i = 0; i < MLDSA_N; ++i) __loop__( assigns(i, memory_slice(a0, sizeof(mld_poly)), memory_slice(a1, sizeof(mld_poly))) @@ -187,6 +197,8 @@ void mld_poly_decompose(mld_poly *a1, mld_poly *a0, const mld_poly *a) { mld_decompose(&a0->coeffs[i], &a1->coeffs[i], a->coeffs[i]); } +#endif /* !None && !(MLD_USE_NATIVE_POLY_DECOMPOSE_32 && (MLDSA_MODE == 3 || \ + MLDSA_MODE == 5)) */ mld_assert_abs_bound(a0->coeffs, MLDSA_N, MLDSA_GAMMA2 + 1); mld_assert_bound(a1->coeffs, MLDSA_N, 0, (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2)); From 7e2191f045f7f85a71fc9b73b5cba896dfb139f1 Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Mon, 4 Aug 2025 17:47:27 +0800 Subject: [PATCH 2/3] AArch64: Add native implementation of poly_decompose This add a native implementation of poly_decompose written from scratch. Resolves https://github.com/pq-code-package/mldsa-native/issues/397 Signed-off-by: Matthias J. Kannwischer --- mldsa/native/aarch64/meta.h | 14 +++ .../native/aarch64/src/arith_native_aarch64.h | 6 + .../aarch64/src/poly_decompose_32_asm.S | 114 ++++++++++++++++++ .../aarch64/src/poly_decompose_88_asm.S | 112 +++++++++++++++++ 4 files changed, 246 insertions(+) create mode 100644 mldsa/native/aarch64/src/poly_decompose_32_asm.S create mode 100644 mldsa/native/aarch64/src/poly_decompose_88_asm.S diff --git a/mldsa/native/aarch64/meta.h b/mldsa/native/aarch64/meta.h index f91379f4..19a554db 100644 --- a/mldsa/native/aarch64/meta.h +++ b/mldsa/native/aarch64/meta.h @@ -13,6 +13,8 @@ #define MLD_USE_NATIVE_REJ_UNIFORM #define MLD_USE_NATIVE_REJ_UNIFORM_ETA2 #define MLD_USE_NATIVE_REJ_UNIFORM_ETA4 +#define MLD_USE_NATIVE_POLY_DECOMPOSE_32 +#define MLD_USE_NATIVE_POLY_DECOMPOSE_88 /* Identifier for this backend so that source and assembly files * in the build can be appropriately guarded. */ @@ -93,6 +95,18 @@ static MLD_INLINE int mld_rej_uniform_eta4_native(int32_t *r, unsigned len, return outlen; } +static MLD_INLINE void mld_poly_decompose_32_native(int32_t *a1, int32_t *a0, + const int32_t *a) +{ + mld_poly_decompose_32_asm(a1, a0, a); +} + +static MLD_INLINE void mld_poly_decompose_88_native(int32_t *a1, int32_t *a0, + const int32_t *a) +{ + mld_poly_decompose_88_asm(a1, a0, a); +} + #endif /* !__ASSEMBLER__ */ #endif /* !MLD_NATIVE_AARCH64_META_H */ diff --git a/mldsa/native/aarch64/src/arith_native_aarch64.h b/mldsa/native/aarch64/src/arith_native_aarch64.h index e011ab4c..1b506090 100644 --- a/mldsa/native/aarch64/src/arith_native_aarch64.h +++ b/mldsa/native/aarch64/src/arith_native_aarch64.h @@ -62,4 +62,10 @@ unsigned mld_rej_uniform_eta2_asm(int32_t *r, const uint8_t *buf, unsigned mld_rej_uniform_eta4_asm(int32_t *r, const uint8_t *buf, unsigned buflen, const uint8_t *table); +#define mld_poly_decompose_32_asm MLD_NAMESPACE(poly_decompose_32_asm) +void mld_poly_decompose_32_asm(int32_t *a1, int32_t *a0, const int32_t *a); + +#define mld_poly_decompose_88_asm MLD_NAMESPACE(poly_decompose_88_asm) +void mld_poly_decompose_88_asm(int32_t *a1, int32_t *a0, const int32_t *a); + #endif /* !MLD_NATIVE_AARCH64_SRC_ARITH_NATIVE_AARCH64_H */ diff --git a/mldsa/native/aarch64/src/poly_decompose_32_asm.S b/mldsa/native/aarch64/src/poly_decompose_32_asm.S new file mode 100644 index 00000000..a9cc4212 --- /dev/null +++ b/mldsa/native/aarch64/src/poly_decompose_32_asm.S @@ -0,0 +1,114 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +.macro decompose32 a1, a0, input, temp + // Step 1: Compute ceil(a / 128) using floor((a + 127) / 128) + // This is the first part of computing a1 = floor(a / (2*GAMMA2)) + // where 2*GAMMA2 = 523776. We break this into two steps: + // ceil(a / 128) followed by round(temp / 4092) + add \a1\().4s, \input\().4s, offset_127.4s + ushr \a1\().4s, \a1\().4s, #7 + + // Step 2: Barrett reduction with rounding: round(temp * 1025 / 2^22) + // This computes: round(ceil(a/128) / 4092) + // Combined: a1 ≈ round(ceil(a/128) / 4092) ≈ floor(a / 523776) + // sqrdmulh computes: (2 * temp * 524800 + 2^31) >> 32 + // which is equivalent to: (temp * 1025 + 2^21) >> 22. + sqrdmulh \a1\().4s, \a1\().4s, barrett_const.4s + + // Step 3: Mask to valid range [0, 14] since (Q-1)/(2*GAMMA2) = 15 + and \a1\().16b, \a1\().16b, mask_15.16b + + // Step 4: Compute a0 = a - a1 * 2*GAMMA2 (low part of decomposition) + mls \input\().4s, \a1\().4s, gamma2_2x.4s + + // Step 5: Conditional reduction: if a0 > (Q-1)/2 then a0 -= Q + cmgt \temp\().4s, \input\().4s, q_half.4s + and \temp\().16b, \temp\().16b, q.16b + sub \a0\().4s, \input\().4s, \temp\().4s +.endm + + /* Parameters */ + a1_ptr .req x0 // Output polynomial with coefficients c1 + a0_ptr .req x1 // Output polynomial with coefficients c0 + a_ptr .req x2 // Input polynomial + + count .req x3 + + /* Constant register assignments */ + q .req v20 // Q = 8380417 + q_half .req v21 // (Q-1)/2 + gamma2_2x .req v22 // 2*GAMMA2 = 523776 + mask_15 .req v23 // mask = 15 + offset_127 .req v24 // offset = 127 + barrett_const .req v25 // Barrett constant = 524800 + + +.text +.global MLD_ASM_NAMESPACE(poly_decompose_32_asm) +.balign 4 +MLD_ASM_FN_SYMBOL(poly_decompose_32_asm) + // Load constants into SIMD registers + movz w4, #57345 + movk w4, #127, lsl #16 + dup q.4s, w4 + + lsr w5, w4, #1 + dup q_half.4s, w5 + + movz w7, #0xfe00 + movk w7, #7, lsl #16 + dup gamma2_2x.4s, w7 + + movi mask_15.4s, #15 + movi offset_127.4s, #127 + + movz w11, #0x0200 + movk w11, #8, lsl #16 + dup barrett_const.4s, w11 + + mov count, #(64/4) + +poly_decompose_32_loop: + ldr q1, [a_ptr, #1*16] + ldr q2, [a_ptr, #2*16] + ldr q3, [a_ptr, #3*16] + ldr q0, [a_ptr], #4*16 + + decompose32 v4, v5, v1, v26 + decompose32 v6, v7, v2, v26 + decompose32 v16, v17, v3, v26 + decompose32 v18, v19, v0, v26 + + + str q4, [a1_ptr, #1*16] + str q6, [a1_ptr, #2*16] + str q16, [a1_ptr, #3*16] + str q18, [a1_ptr], #4*16 + str q5, [a0_ptr, #1*16] + str q7, [a0_ptr, #2*16] + str q17, [a0_ptr, #3*16] + str q19, [a0_ptr], #4*16 + + subs count, count, #1 + bne poly_decompose_32_loop + + ret + + .unreq a1_ptr + .unreq a0_ptr + .unreq a_ptr + .unreq count + .unreq q + .unreq q_half + .unreq gamma2_2x + .unreq mask_15 + .unreq offset_127 + .unreq barrett_const + +#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ diff --git a/mldsa/native/aarch64/src/poly_decompose_88_asm.S b/mldsa/native/aarch64/src/poly_decompose_88_asm.S new file mode 100644 index 00000000..7ba8c4d6 --- /dev/null +++ b/mldsa/native/aarch64/src/poly_decompose_88_asm.S @@ -0,0 +1,112 @@ +/* + * Copyright (c) The mldsa-native project authors + * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT + */ +#include "../../../common.h" + +#if defined(MLD_ARITH_BACKEND_AARCH64) && !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) + +.macro decompose88 a1, a0, input, temp + // Step 1: Compute ceil(a / 128) using floor((a + 127) / 128) + // This is the first part of computing a1 = floor(a / (2*GAMMA2)) + // where 2*GAMMA2 = 190464. We break this into two steps: + // ceil(a / 128) followed by round(temp / 1488) + add \a1\().4s, \input\().4s, offset_127.4s + ushr \a1\().4s, \a1\().4s, #7 + + // Step 2: Barrett reduction with rounding: round(temp * 11275 / 2^24) + // This computes: round(ceil(a/128) / 1488) + // Combined: a1 ≈ round(ceil(a/128) / 1488) ≈ floor(a / 190464) + // sqrdmulh computes: (2 * temp * 1443201 + 2^31) >> 32 + // which is equivalent to: (temp * 11275 + 2^23) >> 24. + sqrdmulh \a1\().4s, \a1\().4s, barrett_const.4s + + // Step 3: Mask to valid range [0, 43] since (Q-1)/(2*GAMMA2) = 44 + cmhi \temp\().4s, const44.4s, \a1\().4s + and \a1\().16b, \a1\().16b, \temp\().16b + + // Step 4: Compute a0 = a - a1 * 2*GAMMA2 (low part of decomposition) + mls \input\().4s, \a1\().4s, gamma2_2x.4s + + // Step 5: Conditional reduction: if a0 > (Q-1)/2 then a0 -= Q + cmgt \temp\().4s, \input\().4s, q_half.4s + and \temp\().16b, \temp\().16b, q.16b + sub \a0\().4s, \input\().4s, \temp\().4s +.endm + + /* Parameters */ + a1_ptr .req x0 // Output polynomial with coefficients c1 + a0_ptr .req x1 // Output polynomial with coefficients c0 + a_ptr .req x2 // Input polynomial + + count .req x3 + + /* Constant register assignments */ + q .req v20 // Q = 8380417 + q_half .req v21 // (Q-1)/2 + gamma2_2x .req v22 // 2*GAMMA2 = 190464 + const44 .req v23 // const = 44 + offset_127 .req v24 // offset = 127 + barrett_const .req v25 // Barrett constant = 1443201 + +.text +.global MLD_ASM_NAMESPACE(poly_decompose_88_asm) +.balign 4 +MLD_ASM_FN_SYMBOL(poly_decompose_88_asm) + // Load constants into SIMD registers + movz w4, #57345 + movk w4, #127, lsl #16 + dup q.4s, w4 + + lsr w5, w4, #1 + dup q_half.4s, w5 + + movz w7, #0xe800 + movk w7, #0x2, lsl #16 + dup gamma2_2x.4s, w7 + + movi const44.4s, #44 + movi offset_127.4s, #127 + + movz w11, #0x0581 + movk w11, #0x16, lsl #16 + dup barrett_const.4s, w11 + + mov count, #(64/4) +poly_decompose_88_loop: + ldr q1, [a_ptr, #1*16] + ldr q2, [a_ptr, #2*16] + ldr q3, [a_ptr, #3*16] + ldr q0, [a_ptr], #4*16 + + decompose88 v4, v5, v1, v26 + decompose88 v6, v7, v2, v26 + decompose88 v16, v17, v3, v26 + decompose88 v18, v19, v0, v26 + + str q4, [a1_ptr, #1*16] + str q6, [a1_ptr, #2*16] + str q16, [a1_ptr, #3*16] + str q18, [a1_ptr], #4*16 + str q5, [a0_ptr, #1*16] + str q7, [a0_ptr, #2*16] + str q17, [a0_ptr, #3*16] + str q19, [a0_ptr], #4*16 + + subs count, count, #1 + bne poly_decompose_88_loop + + ret + + .unreq a1_ptr + .unreq a0_ptr + .unreq a_ptr + .unreq count + .unreq q + .unreq q_half + .unreq gamma2_2x + .unreq const44 + .unreq offset_127 + .unreq barrett_const + +#endif /* MLD_ARITH_BACKEND_AARCH64 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ From 30d236c79797a2f64a5c1c59220dc05209559685 Mon Sep 17 00:00:00 2001 From: "Matthias J. Kannwischer" Date: Thu, 7 Aug 2025 22:27:01 +0800 Subject: [PATCH 3/3] Implement poly_use_hint on top of poly_decompose This eliminates the helper function use_hint and instead inlines it into poly_use_hint to be able to re-use the native poly_decompose code. The disadvantage is that it requires an additional polynomial on the stack. Resolves https://github.com/pq-code-package/mldsa-native/issues/400 Resolvse https://github.com/pq-code-package/mldsa-native/issues/398 Signed-off-by: Matthias J. Kannwischer --- mldsa/poly.c | 33 ++++++++++++++- mldsa/rounding.h | 46 --------------------- proofs/cbmc/poly_use_hint/Makefile | 2 +- proofs/cbmc/use_hint/Makefile | 55 ------------------------- proofs/cbmc/use_hint/use_hint_harness.c | 11 ----- 5 files changed, 33 insertions(+), 114 deletions(-) delete mode 100644 proofs/cbmc/use_hint/Makefile delete mode 100644 proofs/cbmc/use_hint/use_hint_harness.c diff --git a/mldsa/poly.c b/mldsa/poly.c index 49e4c168..2b35c698 100644 --- a/mldsa/poly.c +++ b/mldsa/poly.c @@ -228,16 +228,47 @@ unsigned int mld_poly_make_hint(mld_poly *h, const mld_poly *a0, void mld_poly_use_hint(mld_poly *b, const mld_poly *a, const mld_poly *h) { unsigned int i; + mld_poly a0; mld_assert_bound(a->coeffs, MLDSA_N, 0, MLDSA_Q); mld_assert_bound(h->coeffs, MLDSA_N, 0, 2); + /* Reference: The reference implementation has a function use_hint operating + * on individual coefficients that first calls decompose and then uses the + * same conditionals as in the following. We instead call decompose on the + * entire polynomial with the benefit of being able to re-use the native + * poly_decompose implementation at the cost of a temporay polynomial on the + * stack. + */ + mld_poly_decompose(b, &a0, a); for (i = 0; i < MLDSA_N; ++i) __loop__( invariant(i <= MLDSA_N) invariant(array_bound(b->coeffs, 0, i, 0, (MLDSA_Q-1)/(2*MLDSA_GAMMA2))) + invariant(forall(k0, i, MLDSA_N, b->coeffs[k0] == loop_entry(*b).coeffs[k0])) ) { - b->coeffs[i] = mld_use_hint(a->coeffs[i], h->coeffs[i]); + if (h->coeffs[i] != 0) + { +#if MLDSA_MODE == 2 + if (a0.coeffs[i] > 0) + { + b->coeffs[i] = (b->coeffs[i] == 43) ? 0 : b->coeffs[i] + 1; + } + else + { + b->coeffs[i] = (b->coeffs[i] == 0) ? 43 : b->coeffs[i] - 1; + } +#else /* MLDSA_MODE == 2 */ + if (a0.coeffs[i] > 0) + { + b->coeffs[i] = (b->coeffs[i] + 1) & 15; + } + else + { + b->coeffs[i] = (b->coeffs[i] - 1) & 15; + } +#endif /* MLDSA_MODE != 2 */ + } } mld_assert_bound(b->coeffs, MLDSA_N, 0, (MLDSA_Q - 1) / (2 * MLDSA_GAMMA2)); diff --git a/mldsa/rounding.h b/mldsa/rounding.h index d410bf47..baebe906 100644 --- a/mldsa/rounding.h +++ b/mldsa/rounding.h @@ -121,51 +121,5 @@ __contract__( return 0; } -/************************************************* - * Name: mld_use_hint - * - * Description: Correct high bits according to hint. - * - * Arguments: - int32_t a: input element - * - unsigned int hint: hint bit - * - * Returns corrected high bits. - **************************************************/ -static MLD_INLINE int32_t mld_use_hint(int32_t a, unsigned int hint) -__contract__( - requires(hint >= 0 && hint <= 1) - requires(a >= 0 && a < MLDSA_Q) - ensures(return_value >= 0 && return_value < (MLDSA_Q-1)/(2*MLDSA_GAMMA2)) -) -{ - int32_t a0, a1; - - mld_decompose(&a0, &a1, a); - if (hint == 0) - { - return a1; - } - -#if MLDSA_MODE == 2 - if (a0 > 0) - { - return (a1 == 43) ? 0 : a1 + 1; - } - else - { - return (a1 == 0) ? 43 : a1 - 1; - } -#else /* MLDSA_MODE == 2 */ - if (a0 > 0) - { - return (a1 + 1) & 15; - } - else - { - return (a1 - 1) & 15; - } -#endif /* MLDSA_MODE != 2 */ -} - #endif /* !MLD_ROUNDING_H */ diff --git a/proofs/cbmc/poly_use_hint/Makefile b/proofs/cbmc/poly_use_hint/Makefile index 883fefa4..9cf289e9 100644 --- a/proofs/cbmc/poly_use_hint/Makefile +++ b/proofs/cbmc/poly_use_hint/Makefile @@ -20,7 +20,7 @@ PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c PROJECT_SOURCES += $(SRCDIR)/mldsa/poly.c CHECK_FUNCTION_CONTRACTS=$(MLD_NAMESPACE)poly_use_hint -USE_FUNCTION_CONTRACTS=mld_use_hint +USE_FUNCTION_CONTRACTS=$(MLD_NAMESPACE)poly_decompose APPLY_LOOP_CONTRACTS=on USE_DYNAMIC_FRAMES=1 diff --git a/proofs/cbmc/use_hint/Makefile b/proofs/cbmc/use_hint/Makefile deleted file mode 100644 index b06562ba..00000000 --- a/proofs/cbmc/use_hint/Makefile +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) The mldsa-native project authors -# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - -include ../Makefile_params.common - -HARNESS_ENTRY = harness -HARNESS_FILE = use_hint_harness - -# This should be a unique identifier for this proof, and will appear on the -# Litani dashboard. It can be human-readable and contain spaces if you wish. -PROOF_UID = use_hint - -DEFINES += -INCLUDES += - -REMOVE_FUNCTION_BODY += -UNWINDSET += - -PROOF_SOURCES += $(PROOFDIR)/$(HARNESS_FILE).c -PROJECT_SOURCES += $(SRCDIR)/mldsa/poly.c - -CHECK_FUNCTION_CONTRACTS=mld_use_hint -USE_FUNCTION_CONTRACTS=mld_decompose -APPLY_LOOP_CONTRACTS=on -USE_DYNAMIC_FRAMES=1 - -# Disable any setting of EXTERNAL_SAT_SOLVER, and choose SMT backend instead -EXTERNAL_SAT_SOLVER= -CBMCFLAGS=--smt2 - -FUNCTION_NAME = use_hint - -# If this proof is found to consume huge amounts of RAM, you can set the -# EXPENSIVE variable. With new enough versions of the proof tools, this will -# restrict the number of EXPENSIVE CBMC jobs running at once. See the -# documentation in Makefile.common under the "Job Pools" heading for details. -# EXPENSIVE = true - -# This function is large enough to need... -CBMC_OBJECT_BITS = 8 - -# If you require access to a file-local ("static") function or object to conduct -# your proof, set the following (and do not include the original source file -# ("mldsa/poly.c") in PROJECT_SOURCES). -# REWRITTEN_SOURCES = $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i -# include ../Makefile.common -# $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i_SOURCE = $(SRCDIR)/mldsa/poly.c -# $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i_FUNCTIONS = foo bar -# $(PROOFDIR)/<__SOURCE_FILE_BASENAME__>.i_OBJECTS = baz -# Care is required with variables on the left-hand side: REWRITTEN_SOURCES must -# be set before including Makefile.common, but any use of variables on the -# left-hand side requires those variables to be defined. Hence, _SOURCE, -# _FUNCTIONS, _OBJECTS is set after including Makefile.common. - -include ../Makefile.common diff --git a/proofs/cbmc/use_hint/use_hint_harness.c b/proofs/cbmc/use_hint/use_hint_harness.c deleted file mode 100644 index e1925ddf..00000000 --- a/proofs/cbmc/use_hint/use_hint_harness.c +++ /dev/null @@ -1,11 +0,0 @@ -// Copyright (c) The mldsa-native project authors -// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT - -#include "rounding.h" - -void harness(void) -{ - int32_t a, r; - unsigned int hint; - r = mld_use_hint(a, hint); -}